Initial check-in
Based on Chromium revision f3b63e7356ad0846045fe69dd640781e95728486
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..13d16fb
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,1090 @@
+# Names should be added to this file with this pattern:
+#
+# For individuals:
+# Name <email address>
+#
+# For organizations:
+# Organization <fnmatch pattern>
+#
+# See the Python fnmatch module documentation for more information.
+
+Aaron Boushley <boushley@gmail.com>
+Aaron Jacobs <samusaaron3@gmail.com>
+Aaron Leventhal <aaronlevbugs@gmail.com>
+Aaron Randolph <aaron.randolph@gmail.com>
+Aaryaman Vasishta <jem456.vasishta@gmail.com>
+Abdu Ameen <abdu.ameen000@gmail.com>
+Abhijeet Kandalkar <abhijeet.k@samsung.com>
+Abhishek Agarwal <abhishek.a21@samsung.com>
+Abhishek Kanike <abhishek.ka@samsung.com>
+Abhishek Singh <abhi.rathore@samsung.com>
+Adam Bonner <abonner-chromium@solscope.com>
+Adam Bujalski <abujalski@gmail.com>
+Adam Kallai <kadam@inf.u-szeged.hu>
+Adam Roben <adam@github.com>
+Adam Treat <adam.treat@samsung.com>
+Adam Yi <i@adamyi.com>
+Addanki Gandhi Kishor <kishor.ag@samsung.com>
+Adenilson Cavalcanti <a.cavalcanti@samsung.com>
+Aditya Bhargava <heuristicist@gmail.com>
+Adrian Belgun <adrian.belgun@intel.com>
+Ahmet Emir Ercin <ahmetemiremir@gmail.com>
+Ajay Berwal <a.berwal@samsung.com>
+Ajay Berwal <ajay.berwal@samsung.com>
+Ajith Kumar V <ajith.v@samsung.com>
+Aku Kotkavuo <a.kotkavuo@partner.samsung.com>
+Aldo Culquicondor <alculquicondor@gmail.com>
+Aleksandar Stojiljkovic <aleksandar.stojiljkovic@intel.com>
+Alex Gabriel <minilogo@gmail.com>
+Alex Gartrell <agartrell@cmu.edu>
+Alex Henrie <alexhenrie24@gmail.com>
+Alex Scheele <alexscheele@gmail.com>
+Alexander Douglas <agdoug@amazon.com>
+Alexander Guettler <alexander@guettler.io>
+Alexander Shalamov <alexander.shalamov@intel.com>
+Alexander Sulfrian <alexander@sulfrian.net>
+Alexandre Abreu <wiss1976@gmail.com>
+Alexandru Chiculita <achicu@adobe.com>
+Alexey Korepanov <alexkorep@gmail.com>
+Alexey Kuts <kruntuid@gmail.com>
+Alexey Kuzmin <alex.s.kuzmin@gmail.com>
+Alexey Kuznetsov <saturas2000@gmail.com>
+Alexis Brenon <brenon.alexis@gmail.com>
+Alexis La Goutte <alexis.lagoutte@gmail.com>
+Alexis Menard <alexis.menard@intel.com>
+Alfredo Hernandez <ahernandez.miralles@gmail.com>
+Ali Vathi <ali.akbar@gmail.com>
+Allan Sandfeld Jensen <allan.jensen@qt.io>
+Alper Çakan <alpercakan98@gmail.com>
+Ambarish Rapte <ambarish.r@samsung.com>
+Amey Jahagirdar <jahagird@amazon.com>
+Amit Sarkar <amit.srkr@samsung.com>
+Amogh Bihani <amogh.bihani@samsung.com>
+Amos Lim <amoseui@gmail.com>
+Amos Lim <eui-sang.lim@samsung.com>
+Amruth Raj <amruthraj@motorola.com>
+Amruth Raj <ckqr36@motorola.com>
+Anand Ratn <anand.ratn@samsung.com>
+Anastasios Cassiotis <tom.cassiotis@gmail.com>
+anatoly techtonik <techtonik@gmail.com>
+Ancil George <ancilgeorge@samsung.com>
+Andra Paraschiv <andra.paraschiv@intel.com>
+Andrei Borza <andrei.borza@gmail.com>
+Andrei Parvu <andrei.prv@gmail.com>
+Andrei Parvu <parvu@adobe.com>
+Andrew Boyarshin <andrew.boyarshin@gmail.com>
+Andrew Brampton <me@bramp.net>
+Andrew Hung <andrhung@amazon.com>
+Andrew Jorgensen <ajorgens@amazon.com>
+Andrew MacPherson <andrew.macpherson@soundtrap.com>
+Andrew Tulloch <andrew@tullo.ch>
+Anish Patankar <anish.p@samsung.com>
+Ankit Kumar <ankit2.kumar@samsung.com>
+Ankur Verma <ankur1.verma@samsung.com>
+Anna Henningsen <anna@addaleax.net>
+Anne Kao <annekao94@gmail.com>
+Anssi Hannula <anssi.hannula@iki.fi>
+Anthony Halliday <anth.halliday12@gmail.com>
+Anton Obzhirov <a.obzhirov@samsung.com>
+Antonin Hildebrand <antonin.hildebrand@gmail.com>
+Antonio Gomes <a1.gomes@sisa.samsung.com>
+Anuj Kumar Sharma <anujk.sharma@samsung.com>
+Arjun Karthik <arjunkar@amazon.com>
+Arman Ghotb <armanghotb@gmail.com>
+Armin Burgmeier <aburgmeier@bloomberg.net>
+Arnaud Mandy <arnaud.mandy@intel.com>
+Arnaud Renevier <a.renevier@samsung.com>
+Arpita Bahuguna <a.bah@samsung.com>
+Arthur Lussos <developer0420@gmail.com>
+Arun Kulkarni <kulkarni.a@samsung.com>
+Arun Kumar <arun87.kumar@samsung.com>
+Arun Mankuzhi <arun.m@samsung.com>
+Arunoday Sarkar <a.sarkar.arun@gmail.com>
+Arunprasad Rajkumar <ararunprasad@gmail.com>
+Arunprasad Rajkumar <arurajku@cisco.com>
+Asami Doi <d0iasm.pub@gmail.com>
+Ashish Kumar Gupta <guptaag@amazon.com>
+Ashlin Joseph <ashlin.j@samsung.com>
+Asish Singh <asish.singh@samsung.com>
+Attila Dusnoki <dati91@gmail.com>
+Avinaash Doreswamy <avi.nitk@samsung.com>
+Ayush Khandelwal <k.ayush@samsung.com>
+Azhar Shaikh <azhar.shaikh@intel.com>
+Balazs Kelemen <b.kelemen@samsung.com>
+Baul Eun <baul.eun@samsung.com>
+Behara Mani Shyam Patro <behara.ms@samsung.com>
+Bem Jones-Bey <bemajaniman@gmail.com>
+Bem Jones-Bey <bjonesbe@adobe.com>
+Ben Coe <bencoe@gmail.com>
+Ben Fiola <benfiola@gmail.com>
+Ben Karel <eschew@gmail.com>
+Ben Noordhuis <ben@strongloop.com>
+Benedek Heilig <benecene@gmail.com>
+Benjamin Dupont <bedupont@cisco.com>
+Benjamin Jemlich <pcgod99@gmail.com>
+Bernard Cafarelli <voyageur@gentoo.org>
+Bernhard M. Wiedemann <bwiedemann@suse.de>
+Bert Belder <bertbelder@gmail.com>
+Bhagirathi Satpathy <bhagirathi.s@samsung.com>
+Bhanukrushana Rout <b.rout@samsung.com>
+Biljith Jayan <billy.jayan@samsung.com>
+Boaz Sender <boaz@bocoup.com>
+Bobby Powers <bobbypowers@gmail.com>
+Branden Archer <bma4@zips.uakron.edu>
+Brendan Kirby <brendan.kirby@imgtec.com>
+Brendan Long <self@brendanlong.com>
+Brian G. Merrell <bgmerrell@gmail.com>
+Brian Konzman, SJ <b.g.konzman@gmail.com>
+Brian Luft <brian@electroly.com>
+Brian Merrell, Novell Inc. <bgmerrell@gmail.com>
+Brian Yip <itsbriany@gmail.com>
+Bruno Calvignac <bruno@flock.com>
+Bruno de Oliveira Abinader <brunoabinader@gmail.com>
+Bruno Roy <brusi_roy@hotmail.com>
+Bryan Donlan <bdonlan@gmail.com>
+Bryce Thomas <bryct@amazon.com>
+Burton <burton@typewritten.net>
+Byounghoon Yoon <bill.2714@kakaocorp.com>
+Byoungkwon Ko <codeimpl@gmail.com>
+Byungwoo Lee <bw80.lee@samsung.com>
+Caesar Wang <wxt@rock-chips.com>
+Caio Marcelo de Oliveira Filho <caio.de.oliveira.filho@intel.com>
+Caitlin Potter <caitpotter88@gmail.com>
+Calvin Mei <calvimei@amazon.com>
+Cameron Gutman <aicommander@gmail.com>
+Catalin Badea <badea@adobe.com>
+Cathie Chen <cathiechen@tencent.com>
+Cem Kocagil <cem.kocagil@gmail.com>
+Chakshu Ahuja <chakshu.a@samsung.com>
+Chamal De Silva <chamalsl@yahoo.com>
+Chandan Padhi <c.padhi@samsung.com>
+Chandra Shekar Vallala <brk376@motorola.com>
+Chandramouli Sanchi <cm.sanchi@samsung.com>
+Chang Shu <c.shu@samsung.com>
+Changbin Shao <changbin.shao@intel.com>
+Changjun Yang <changjun.yang@intel.com>
+ChangSeok Oh <shivamidow@gmail.com>
+Changwan Hong <changwan.hong@navercorp.com>
+Changyeon Kim <cyzero.kim@samsung.com>
+Chanho Park <parkch98@gmail.com>
+Chansik Yun <chansik.yun@gmail.com>
+Chaobin Zhang <zhchbin@gmail.com>
+Charles Vaughn <cvaughn@gmail.com>
+Choongwoo Han <cwhan.tunz@gmail.com>
+Chris Greene <cwgreene@amazon.com>
+Chris Harrelson <chrishtr@gmail.com>
+Chris Nardi <hichris123@gmail.com>
+Chris Szurgot <szurgotc@amazon.com>
+Chris Tserng <tserng@amazon.com>
+Chris Vasselli <clindsay@gmail.com>
+Christophe Dumez <ch.dumez@samsung.com>
+Christopher Dale <chrelad@gmail.com>
+Claudio DeSouza <claudiomdsjr@gmail.com>
+Clemens Fruhwirth <clemens@endorphin.org>
+Clement Scheelfeldt Skau <clementskau@gmail.com>
+Clinton Staley <clintstaley@gmail.com>
+Connor Pearson <cjp822@gmail.com>
+Craig Schlenter <craig.schlenter@gmail.com>
+Csaba Osztrogonác <ossy.szeged@gmail.com>
+Daegyu Lee <na7jun8gi@gmail.com>
+Dai Chunyang <chunyang.dai@intel.com>
+Daiwei Li <daiweili@suitabletech.com>
+Damien Marié <damien@dam.io>
+Dan McCombs <overridex@gmail.com>
+Daniel Bevenius <daniel.bevenius@gmail.com>
+Daniel Bomar <dbdaniel42@gmail.com>
+Daniel Carvalho Liedke <dliedke@gmail.com>
+Daniel Charles <daniel.charles@intel.com>
+Daniel Imms <daniimms@amazon.com>
+Daniel Johnson <danielj41@gmail.com>
+Daniel Lockyer <thisisdaniellockyer@gmail.com>
+Daniel Nishi <dhnishi@gmail.com>
+Daniel Platz <daplatz@googlemail.com>
+Daniel Shaulov <dshaulov@ptc.com>
+Daniel Trebbien <dtrebbien@gmail.com>
+Daniel Waxweiler <daniel.waxweiler@gmail.com>
+Dániel Bátyai <dbatyai@inf.u-szeged.hu>
+Dániel Vince <vinced@inf.u-szeged.hu>
+Darshini KN <kn.darshini@samsung.com>
+Dave Barker <kzar@kzar.co.uk>
+David Benjamin <davidben@mit.edu>
+David Davidovic <david@davidovic.io>
+David Erceg <erceg.david@gmail.com>
+David Fox <david@davidjfox.com>
+David Futcher <david.mike.futcher@gmail.com>
+David Leen <davileen@amazon.com>
+David McAllister <mcdavid@amazon.com>
+David Michael Barr <david.barr@samsung.com>
+David Spellman <dspell@amazon.com>
+David Valachovic <adenflorian@gmail.com>
+Dax Kelson <dkelson@gurulabs.com>
+Debashish Samantaray <d.samantaray@samsung.com>
+Debug Wang <debugwang@tencent.com>
+Deepak Dilip Borade <deepak.db@samsung.com>
+Deepak Mittal <deepak.m1@samsung.com>
+Deepak Sharma <deepak.sharma@amd.com>
+Deepak Singla <deepak.s@samsung.com>
+Deokjin Kim <deokjin81.kim@samsung.com>
+Derek Halman <d.halman@gmail.com>
+Devlin Cronin <rdevlin.cronin@gmail.com>
+Diana Suvorova <diana.suvorova@gmail.com>
+Diego Ferreiro Val <elfogris@gmail.com>
+Dillon Sellars <dill.sellars@gmail.com>
+Divya Bansal <divya.bansal@samsung.com>
+Dominic Farolino <domfarolino@gmail.com>
+Dominic Jodoin <dominic.jodoin@gmail.com>
+Dominik Röttsches <dominik.rottsches@intel.com>
+Don Woodward <woodward@adobe.com>
+Donghee Na <corona10@gmail.com>
+Dong-hee Na <donghee.na92@gmail.com>
+Dongie Agnir <dongie.agnir@gmail.com>
+Dongjun Kim <djmix.kim@samsung.com>
+Dongseong Hwang <dongseong.hwang@intel.com>
+Dongwoo Joshua Im <dw.im@samsung.com>
+Dongyu Lin <l2d4y3@gmail.com>
+Donna Wu <donna.wu@intel.com>
+Douglas F. Turner <doug.turner@gmail.com>
+Dustin Doloff <doloffd@amazon.com>
+Ebrahim Byagowi <ebrahim@gnu.org>
+Ebrahim Byagowi <ebraminio@gmail.com>
+Eden Wang <nedenwang@tencent.com>
+Eduardo Lima (Etrunko) <eblima@gmail.com>
+Eduardo Lima (Etrunko) <eduardo.lima@intel.com>
+Edward Baker <edward.baker@intel.com>
+Edward Crossman <tedoc2000@gmail.com>
+Eero Häkkinen <e.hakkinen@samsung.com>
+Eero Häkkinen <eero.hakkinen@intel.com>
+Egor Starkov <egor.starkov@samsung.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Elan Ruusamäe <elan.ruusamae@gmail.com>
+Ergun Erdogmus <erdogmusergun@gmail.com>
+Eric Ahn <byungwook.ahn@gmail.com>
+Eric Rescorla <ekr@rtfm.com>
+Erik Hill <erikghill@gmail.com>
+Erik Sjölund <erik.sjolund@gmail.com>
+Eriq Augustine <eriq.augustine@gmail.com>
+Ernesto Mudu <ernesto.mudu@gmail.com>
+Etienne Laurin <etienne@atnnn.com>
+Eugene Kim <eugene70kim@gmail.com>
+Eugene Sudin <eugene@sudin.pro>
+Eunseok Oh <fivesilverstone@gmail.com>
+Evan Peterson <evan.peterson.ep@gmail.com>
+Evan Wallace <evan.exe@gmail.com>
+Evangelos Foutras <evangelos@foutrelis.com>
+Evgeniy Dushistov <dushistov@gmail.com>
+Evgeny Agafonchikov <evgeny.agafonchikov@akvelon.com>
+Fabien Tassin <fta@sofaraway.org>
+Felix H. Dahlke <fhd@ubercode.de>
+Fengrong Fang <fr.fang@samsung.com>
+Fernando Jiménez Moreno <ferjmoreno@gmail.com>
+Finbar Crago <finbar.crago@gmail.com>
+François Beaufort <beaufort.francois@gmail.com>
+Francois Kritzinger <francoisk777@gmail.com>
+Francois Rauch <leopardb@gmail.com>
+Frankie Dintino <fdintino@theatlantic.com>
+Franklin Ta <fta2012@gmail.com>
+Frédéric Jacob <frederic.jacob.78@gmail.com>
+Frédéric Wang <fred.wang@free.fr>
+Fu Junwei <junwei.fu@intel.com>
+Gabor Rapcsanyi <g.rapcsanyi@samsung.com>
+Gaetano Mendola <mendola@gmail.com>
+Gajendra N <gajendra.n@samsung.com>
+Gajendra Singh <wxjg68@motorola.com>
+Ganesh Borle <ganesh.borle@samsung.com>
+Gao Chun <chun.gao@intel.com>
+Gao Chun <gaochun.dev@gmail.com>
+Gaurav Dhol <gaurav.dhol@einfochips.com>
+Gautham Banasandra <gautham.bangalore@gmail.com>
+George Adams <geoada@amazon.com>
+George Joseph <kottackal.george@gmail.com>
+George Liaskos <geo.liaskos@gmail.com>
+Georgy Buranov <gburanov@gmail.com>
+Gergely Nagy <ngg@ngg.hu>
+Getulio Sánchez <valentin2507@gmail.com>
+Gideon Pyzer <gjpyzer@gmail.com>
+Giovanni Panaro <tsrwebgl@gmail.com>
+Girish Kumar M <mck.giri@samsung.com>
+Gitanshu Mehndiratta <g.mehndiratt@samsung.com>
+Giuseppe Iuculano <giuseppe@iuculano.it>
+Gnanasekar Somanathan <gnanasekar.s@samsung.com>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Goutham Jagannatha <wrm364@motorola.com>
+Graham Yoakum <gyoakum@skobalt.com>
+Greg Visser <gregvis@gmail.com>
+Gregory Davis <gpdavis.chromium@gmail.com>
+Grzegorz Czajkowski <g.czajkowski@samsung.com>
+Guangzhen Li <guangzhen.li@intel.com>
+Gurpreet Kaur <k.gurpreet@samsung.com>
+Gustav Tiger <gustav.tiger@sonymobile.com>
+Gyuyoung Kim <gyuyoung.kim@navercorp.com>
+Gzob Qq <gzobqq@gmail.com>
+Habib Virji <habib.virji@samsung.com>
+Haeun Kim <ggrace.kim93@gmail.com>
+Haeun Kim <haeungun@gmail.com>
+Haitao Feng <haitao.feng@intel.com>
+Halley Zhao <halley.zhao@intel.com>
+Halton Huo <halton.huo@gmail.com>
+Halton Huo <halton.huo@intel.com>
+Hans Hillen <hans.hillen@gmail.com>
+Hao Li <hao.x.li@intel.com>
+Haojian Wu <hokein.wu@gmail.com>
+Hari Singh <hari.singh1@samsung.com>
+Harpreet Singh Khurana <harpreet.sk@samsung.com>
+Harshikesh Kumar <harshikeshnobug@gmail.com>
+Hassan Salehe Matar <hassansalehe@gmail.com>
+Hautio Kari <khautio@gmail.com>
+Heejin R. Chung <heejin.r.chung@samsung.com>
+Heeyoun Lee <heeyoun.lee@samsung.com>
+Henrique Limas <henrique.ramos.limas@gmail.com>
+Himanshu Joshi <h.joshi@samsung.com>
+Holger Kraus <kraush@amazon.com>
+Hong Zheng <hong.zheng@intel.com>
+Hongbo Min <hongbo.min@intel.com>
+Horia Olaru <horia.olaru@gmail.com>
+Horia Olaru <olaru@adobe.com>
+Hosung You <hosung.you@samsung.com>
+Huapeng Li <huapengl@amazon.com>
+Huayong Xu <huayong.xu@samsung.com>
+Hugo Holgersson <hugo.holgersson@sonymobile.com>
+Hui Wang <wanghui07050707@gmail.com>
+Huiwon Jo <jhwon0415@gmail.com>
+Huy Duong <huy.duongdinh@gmail.com>
+Hwanseung Lee <hs1217.lee@gmail.com>
+Hwanseung Lee <hs1217.lee@samsung.com>
+Hyemi Shin <hyemi.sin@samsung.com>
+HyeockJin Kim <kherootz@gmail.com>
+Hyungchan Kim <inlinechan@gmail.com>
+Hyungwook Lee <hyungwook.lee@navercorp.com>
+Hyungwook Lee <withlhw@gmail.com>
+Hyunjun Shin <hyunjun.shin2@navercorp.com>
+Hyunjune Kim <hyunjune.kim@samsung.com>
+Hyunki Baik <hyunki.baik@samsung.com>
+Ian Cullinan <cullinan@amazon.com>
+Ian Scott <ian.scott@arteris.com>
+Ibrar Ahmed <ibrar.ahmad@gmail.com>
+Ilia Demianenko <ilia.demianenko@gmail.com>
+Ilia K <ki.stfu@gmail.com>
+Ilya Konstantinov <ilya.konstantinov@gmail.com>
+Imranur Rahman <ir.shimul@gmail.com>
+Ion Rosca <rosca@adobe.com>
+Irmak Kavasoglu <irmakkavasoglu@gmail.com>
+Isaac Murchie <murchieisaac@gmail.com>
+Isaac Reilly <reillyi@amazon.com>
+Ivan Naydonov <samogot@gmail.com>
+Ivan Sham <ivansham@amazon.com>
+Jack Bates <jack@nottheoilrig.com>
+Jacob Clark <jacob.jh.clark@googlemail.com>
+Jacob Mandelson <jacob@mandelson.org>
+Jaehun Lim <ljaehun.lim@samsung.com>
+Jaehyun Lee <j-hyun.lee@samsung.com>
+Jaekyeom Kim <btapiz@gmail.com>
+Jaemin Seo <jaemin86.seo@samsung.com>
+Jaeseok Yoon <yjaeseok@gmail.com>
+Jaewon Choi <jaewon.james.choi@gmail.com>
+Jaeyong Bae <jdragon.bae@gmail.com>
+Jaime Soriano Pastor <jsorianopastor@gmail.com>
+Jake Helfert <jake@helfert.us>
+Jake Hendy <me@jakehendy.com>
+Jakob Weigert <jakob.j.w@googlemail.com>
+Jakub Machacek <xtreit@gmail.com>
+James Burton <jb@0.me.uk>
+James Choi <jchoi42@pha.jhu.edu>
+James Stanley <james@apphaus.co.uk>
+James Vega <vega.james@gmail.com>
+James Wei <james.wei@intel.com>
+James Willcox <jwillcox@litl.com>
+Jan Rucka <ruckajan10@gmail.com>
+Jan Sauer <jan@jansauer.de>
+Janwar Dinata <j.dinata@gmail.com>
+Jared Shumway <jaredshumway94@gmail.com>
+Jared Sohn <jared.sohn@gmail.com>
+Jared Wein <weinjared@gmail.com>
+Jari Karppanen <jkarp@amazon.com>
+Jay Oster <jay@kodewerx.org>
+Jay Soffian <jaysoffian@gmail.com>
+Jeado Ko <haibane84@gmail.com>
+Jeffrey C <jeffreyca16@gmail.com>
+Jeongeun Kim <je_julie.kim@samsung.com>
+Jeongmin Kim <kimwjdalsl@gmail.com>
+Jeongwoo Park <skeksk91@gmail.com>
+Jeremy Noring <jnoring@hirevue.com>
+Jeremy Spiegel <jeremysspiegel@gmail.com>
+Jeroen Van den Berghe <vandenberghe.jeroen@gmail.com>
+Jerry Lin <wahahab11@gmail.com>
+Jesper Storm Bache <jsbache@gmail.com>
+Jesse Miller <jesse@jmiller.biz>
+Jesus Sanchez-Palencia <jesus.sanchez-palencia.fernandez.fil@intel.com>
+Jiadong Zhu <jiadong.zhu@linaro.org>
+Jiajia Qin <jiajia.qin@intel.com>
+Jiajie Hu <jiajie.hu@intel.com>
+Jianjun Zhu <jianjun.zhu@intel.com>
+Jianneng Zhong <muzuiget@gmail.com>
+Jiawei Shao <jiawei.shao@intel.com>
+Jie Chen <jie.a.chen@intel.com>
+Jihoon Chung <j.c@navercorp.com>
+Jihoon Chung <jihoon@gmail.com>
+Jihun Brent Kim <devgrapher@gmail.com>
+Jihwan Marc Kim <bluewhale.marc@gmail.com>
+Jin Yang <jin.a.yang@intel.com>
+Jincheol Jo <jincheol.jo@navercorp.com>
+Jinfeng Ma <majinfeng1@xiaomi.com>
+Jing Zhao <zhaojing7@xiaomi.com>
+Jinglong Zuo <zuojinglong@xiaomi.com>
+Jingwei Liu <kingweiliu@gmail.com>
+Jingyi Wei <wjywbs@gmail.com>
+Jinho Bang <jinho.bang@samsung.com>
+Jinsong Fan <fanjinsong@sogou-inc.com>
+Jinwoo Song <jinwoo7.song@samsung.com>
+Jinyoung Hur <hurims@gmail.com>
+Jitendra Kumar Sahoo <jitendra.ks@samsung.com>
+Joachim Bauch <jbauch@webrtc.org>
+Joachim Bauch <mail@joachim-bauch.de>
+Joanmarie Diggs <joanmarie.diggs@gmail.com>
+Joe Knoll <joe.knoll@workday.com>
+Joe Thomas <mhx348@motorola.com>
+Joel Stanley <joel@jms.id.au>
+Johannes Rudolph <johannes.rudolph@googlemail.com>
+John Kleinschmidt <kleinschmidtorama@gmail.com>
+John Yani <vanuan@gmail.com>
+John Yoo <nearbyh13@gmail.com>
+Johnson Lin <johnson.lin@intel.com>
+Jonathan Frazer <listedegarde@gmail.com>
+Jonathan Garbee <jonathan@garbee.me>
+Jonathan Hacker <jhacker@arcanefour.com>
+Jongdeok Kim <jongdeok.kim@navercorp.com>
+Jongheon Kim <sapzape@gmail.com>
+JongKwon Lee <jongkwon.lee@navercorp.com>
+Jongsoo Lee <leejongsoo@gmail.com>
+Joone Hur <joone.hur@intel.com>
+Joonghun Park <pjh0718@gmail.com>
+Jorge Villatoro <jorge@tomatocannon.com>
+Joseph Gentle <josephg@gmail.com>
+Joseph Lolak <joseph.lolak@samsung.com>
+Josh Triplett <josh.triplett@intel.com>
+Josh Triplett <josh@joshtriplett.org>
+Joshua Lock <joshua.lock@intel.com>
+Joshua Roesslein <jroesslein@gmail.com>
+Josué Ratelle <jorat1346@gmail.com>
+Josyula Venkat Narasimham <venkat.nj@samsung.com>
+Juan Jose Lopez Jaimez <jj.lopezjaimez@gmail.com>
+Juhui Lee <juhui24.lee@samsung.com>
+Julien Brianceau <jbriance@cisco.com>
+Julien Isorce <j.isorce@samsung.com>
+Julien Racle <jracle@logitech.com>
+Jun Fang <jun_fang@foxitsoftware.com>
+Jun Jiang <jun.a.jiang@intel.com>
+Junchao Han <junchao.han@intel.com>
+Junghoon Lee <sjh836@gmail.com>
+Junghyuk Yoo <wjdgurdl272@gmail.com>
+JungJik Lee <jungjik.lee@samsung.com>
+Jungkee Song <jungkee.song@samsung.com>
+Junmin Zhu <junmin.zhu@intel.com>
+Justin Okamoto <justmoto@amazon.com>
+Justin Ribeiro <justin@justinribeiro.com>
+Jüri Valdmann <juri.valdmann@qt.io>
+Kai Jiang <jiangkai@gmail.com>
+Kai Köhne <kai.koehne@qt.io>
+Kai Uwe Broulik <kde@privat.broulik.de>
+Kal Conley <kcconley@gmail.com>
+Kalyan Kondapally <kalyan.kondapally@intel.com>
+Kamil Jiwa <kamil.jiwa@gmail.com>
+Kamil Rytarowski <krytarowski@gmail.com>
+Kangil Han <kangil.han@samsung.com>
+Kangyuan Shu <kangyuan.shu@intel.com>
+Karan Thakkar <karanjthakkar@gmail.com>
+Kartikey Bhatt <kartikey@amazon.com>
+Kaspar Brand <googlecontrib@velox.ch>
+Kaustubh Atrawalkar <kaustubh.a@samsung.com>
+Kaustubh Atrawalkar <kaustubh.ra@gmail.com>
+Ke He <ke.he@intel.com>
+Keene Pan <keenepan@linpus.com>
+Keita Yoshimoto <y073k3@gmail.com>
+Keith Chen <keitchen@amazon.com>
+Kenneth Rohde Christiansen <kenneth.r.christiansen@intel.com>
+Kenneth Strickland <ken.strickland@gmail.com>
+Kenneth Zhou <knthzh@gmail.com>
+Keonho Kim <keonho07.kim@samsung.com>
+Ketan Goyal <ketan.goyal@samsung.com>
+Kevin Gibbons <bakkot@gmail.com>
+Kevin Lee Helpingstine <sig11@reprehensible.net>
+Kevin M. McCormick <mckev@amazon.com>
+Khasim Syed Mohammed <khasim.mohammed@linaro.org>
+Kihong Kwon <kihong.kwon@samsung.com>
+Kihoon Ko <rhrlgns777@gmail.com>
+Kihwang Kim <pwangkk@gmail.com>
+Kim Christensen <kimworking@gmail.com>
+Kimberly Hunter <kimberhu@amazon.com>
+Kingshuk Jana <kingshuk.j@samsung.com>
+Kirill Bobyrev <kirillbobyrev@gmail.com>
+Kirill Ovchinnikov <kirill.ovchinn@gmail.com>
+Klemen Forstnerič <klemen.forstneric@gmail.com>
+Kodam Nagaraju <k2.nagaraju@samsung.com>
+Konrad Dzwinel <kdzwinel@gmail.com>
+Krishna Chaitanya <krish.botta@samsung.com>
+Kristof Kosztyo <kkosztyo.u-szeged@partner.samsung.com>
+Krzysztof Czech <k.czech@samsung.com>
+Krzysztof Wolanski <k.wolanski@samsung.com>
+Kui Tan <tk1061178@gmail.com>
+Kunal Thakar <kunalt@gmail.com>
+Kushal Pisavadia <kushi.p@gmail.com>
+Kwangho Shin <k_h.shin@samsung.com>
+Kyle Nahrgang <kpn24@drexel.edu>
+Kyle Plumadore <kyle.plumadore@amd.com>
+Kyounga Ra <kyounga.ra@gmail.com>
+Kyoungdeok Kwon <kkd927@gmail.com>
+Kyung Yeol Kim <chitacan@gmail.com>
+Kyungtae Kim <ktf.kim@samsung.com>
+Kyungyoung Heo <bbvch13531@gmail.com>
+Lalit Chandivade <lalit.chandivade@einfochips.com>
+Laszlo Gombos <l.gombos@samsung.com>
+Laszlo Radanyi <bekkra@gmail.com>
+Lauren Yeun Kim <lauren.yeun.kim@gmail.com>
+Lauri Oherd <lauri.oherd@gmail.com>
+Lavar Askew <open.hyperion@gmail.com>
+Legend Lee <guanxian.li@intel.com>
+Leith Bade <leith@leithalweapon.geek.nz>
+Lenny Khazan <lenny.khazan@gmail.com>
+Leo Wolf <jclw@ymail.com>
+Leon Han <leon.han@intel.com>
+Leung Wing Chung <lwchkg@gmail.com>
+Li Yin <li.yin@intel.com>
+Lidwine Genevet <lgenevet@cisco.com>
+Lin Sun <lin.sun@intel.com>
+Lingyun Cai <lingyun.cai@intel.com>
+Lionel Landwerlin <lionel.g.landwerlin@intel.com>
+Lizhi Fan <lizhi.fan@samsung.com>
+Loo Rong Jie <loorongjie@gmail.com>
+Lorenzo Stoakes <lstoakes@gmail.com>
+Lu Guanqun <guanqun.lu@gmail.com>
+Luca Di Domenico <luca94dd@gmail.com>
+Lucie Brozkova <lucinka.brozkova@gmail.com>
+Luiz Von Dentz <luiz.von.dentz@intel.com>
+Luka Dojcilovic <l.dojcilovic@gmail.com>
+Luke Inman-Semerau <luke.semerau@gmail.com>
+Luke Zarko <lukezarko@gmail.com>
+Luoxi Pan <l.panpax@gmail.com>
+Maarten Lankhorst <m.b.lankhorst@gmail.com>
+Magnus Danielsson <fuzzac@gmail.com>
+Mahesh Kulkarni <mahesh.kk@samsung.com>
+Mahesh Machavolu <mahesh.ma@samsung.com>
+Maksim Kolesin <mkolesin@gmail.com>
+Maksim Sisov <maksim.sisov@intel.com>
+Malcolm Wang <malcolm.2.wang@gmail.com>
+Mallikarjuna Rao V <vm.arjun@samsung.com>
+Manish Chhajer <chhajer.m@samsung.com>
+Manish Jethani <m.jethani@eyeo.com>
+Manojkumar Bhosale <manojkumar.bhosale@imgtec.com>
+Manuel Braun <thembrown@gmail.com>
+Mao Yujie <maojie0924@gmail.com>
+Mao Yujie <yujie.mao@intel.com>
+Marc des Garets <marc.desgarets@googlemail.com>
+Marcin Wiacek <marcin@mwiacek.com>
+Marco Rodrigues <gothicx@gmail.com>
+Mario Pistrich <m.pistrich@gmail.com>
+Mario Sanchez Prada <mario.prada@samsung.com>
+Mariusz Mlynski <marius.mlynski@gmail.com>
+Mark Hahnenberg <mhahnenb@andrew.cmu.edu>
+Mark Seaborn <mrs@mythic-beasts.com>
+Martijn Croonen <martijn@martijnc.be>
+Martin Bednorz <m.s.bednorz@gmail.com>
+Martin Rogalla <martin@martinrogalla.com>
+Martina Kollarova <martina.kollarova@intel.com>
+Masahiro Yado <yado.masa@gmail.com>
+Masaru Nishida <msr.i386@gmail.com>
+Matej Knopp <matej.knopp@gmail.com>
+Matheus Bratfisch <matheusbrat@gmail.com>
+Mathias Bynens <mathias@qiwi.be>
+Mathieu Meisser <mmeisser@logitech.com>
+Matt Arpidone <mma.public@gmail.com>
+Matt Strum <mstrum@amazon.com>
+Matt Zeunert <matt@mostlystatic.com>
+Matthew Bauer <mjbauer95@gmail.com>
+Matthew Demarest <demarem@amazon.com>
+Matthew Robertson <matthewrobertson03@gmail.com>
+Matthew Turk <matthewturk@gmail.com>
+Matthew Willis <appamatto@gmail.com>
+Matthias Reitinger <reimarvin@gmail.com>
+Matthieu Rigolot <matthieu.rigolot@gmail.com>
+Max Perepelitsyn <pph34r@gmail.com>
+Max Vujovic <mvujovic@adobe.com>
+Mayank Gupta <mayank.g1@samsung.com>
+Mayur Kankanwadi <mayurk.vk@samsung.com>
+Md Abdullah Al Alamin <a.alamin.cse@gmail.com>
+Md. Hasanur Rashid <hasanur.r@samsung.com>
+Md Jobed Hossain <jrony15@gmail.com>
+Md Sami Uddin <md.sami@samsung.com>
+Michael Cirone <mikecirone@gmail.com>
+Michael Gilbert <floppymaster@gmail.com>
+Michael Lopez <lopes92290@gmail.com>
+Michael Morrison <codebythepound@gmail.com>
+Michael Müller <michael@fds-team.de>
+Michael Schechter <mike.schechter@gmail.com>
+Michaël Zasso <mic.besace@gmail.com>
+Michael Zugelder <michael@zugelder.org>
+Michel Promonet <michel.promonet.1@gmail.com>
+Mihai Maerean <mmaerean@adobe.com>
+Mihai Tica <mihai.o.tica@gmail.com>
+Mihai Tica <mitica@adobe.com>
+Mike Pennisi <mike@mikepennisi.com>
+Mike Tilburg <mtilburg@adobe.com>
+Mikhail Pozdnyakov <mikhail.pozdnyakov@intel.com>
+Milko Leporis <milko.leporis@imgtec.com>
+Milton Chiang <milton.chiang@mediatek.com>
+Minggang Wang <minggang.wang@intel.com>
+Mingmin Xie <melvinxie@gmail.com>
+Minjeong Lee <apenr1234@gmail.com>
+Minseok Koo <kei98301@gmail.com>
+Minsoo Max Koo <msu.koo@samsung.com>
+Miran Karic <miran.karic@imgtec.com>
+Mirela Budaes <mbudaes@adobe.com>
+Mirela Budaes <mbudaes@gmail.com>
+Miyoung Shin <myid.shin@navercorp.com>
+Mohamed I. Hammad <ibraaaa@gmail.com>
+Mohamed Mansour <m0.interactive@gmail.com>
+Mohammad Azam <m.azam@samsung.com>
+Mohammed Wajahat Ali Siddiqui <wajahat.s@samsung.com>
+Mohan Reddy <mohan.reddy@samsung.com>
+Mohit Bhalla <bhallam@amazon.com>
+Momoko Hattori <momohatt10@gmail.com>
+Mostafa Sedaghat joo <mostafa.sedaghat@gmail.com>
+Mrunal Kapade <mrunal.kapade@intel.com>
+Myeongjin Cho <myeongjin.cho@navercorp.com>
+Myles C. Maxfield <mymax@amazon.com>
+Myung-jong Kim <mjkim610@gmail.com>
+Nagarajan Narayanan <nagarajan.n@samsung.com>
+Nagarjuna Atluri <nagarjuna.a@samsung.com>
+Naiem Shaik <naiem.shaik@gmail.com>
+Naoki Takano <takano.naoki@gmail.com>
+Naveen Bobbili <naveenbobbili@motorola.com>
+Naveen Bobbili <qghc36@motorola.com>
+Naveen Kumar Devaraj <devarajn@amazon.com>
+Naveen Kumar S G <naveensg@samsung.com>
+Nayan Kumar K <qtc746@motorola.com>
+Neal Gompa <ngompa13@gmail.com>
+Ned Williamson <nedwilliamson@gmail.com>
+Nedeljko Babic <nedeljko.babic@imgtec.com>
+Nikhil Bansal <n.bansal@samsung.com>
+Nikhil Sahni <nikhil.sahni@samsung.com>
+Nikita Ofitserov <himikof@gmail.com>
+Niklas Hambüchen <mail@nh2.me>
+Niklas Schulze <me@jns.io>
+Nikola Kovacs <nikola.kovacs@gmail.com>
+Nils Schneider <nils.schneider@gmail.com>
+Nils Schneider <nils@nilsschneider.net>
+Ningxin Hu <ningxin.hu@intel.com>
+Nitish Mehrotra <nitish.m@samsung.com>
+Noj Vek <nojvek@gmail.com>
+Nolan Cao <nolan.robin.cao@gmail.com>
+Oleksii Kadurin <ovkadurin@gmail.com>
+Oliver Dunk <oliver@oliverdunk.com>
+Olli Raula (Old name Olli Syrjälä) <olli.raula@intel.com>
+Omar Sandoval <osandov@osandov.com>
+Pan Deng <pan.deng@intel.com>
+Parag Radke <nrqv63@motorola.com>
+Paritosh Kumar <paritosh.in@samsung.com>
+Patrasciuc Sorin Cristian <cristian.patrasciuc@gmail.com>
+Patrick Chan <chanpatorikku@gmail.com>
+Patrick Kettner <patrickkettner@gmail.com>
+Patrick Riordan <patrickriordan177@gmail.com>
+Patrick Stein <patrickwonders@gmail.com>
+Patrik Ackland <patrikackland@gmail.com>
+Paul Adolph <padolph@netflix.com>
+Paul Kehrer <paul.l.kehrer@gmail.com>
+Paul Lind <paul.lind@imgtec.com>
+Paul Nettleship <pnettleship@gmail.com>
+Paul Robinson <paulrobinson85@googlemail.com>
+Paul Roskell <blurrech@gmail.com>
+Paul Sapunaru <paul.sapunaru@intel.com>
+Paul Wicks <pwicks86@gmail.com>
+Pavan Kumar Emani <pavan.e@samsung.com>
+Pavel Golikov <paullo612@ya.ru>
+Pavel Ivanov <paivanof@gmail.com>
+Pawel Forysiuk <p.forysiuk@samsung.com>
+Paweł Hajdan jr <phajdan.jr@gmail.com>
+Payal Pandey <payal.pandey@samsung.com>
+Peng Hu <penghu@tencent.com>
+Peng Jiang <leiyi.jp@gmail.com>
+Peng Xinchao <pxinchao@gmail.com>
+Peter Bright <drpizza@quiscalusmexicanus.org>
+Peter Brophy <pbrophy@adobe.com>
+Peter Collingbourne <peter@pcc.me.uk>
+Peter Gal <pgal.u-szeged@partner.samsung.com>
+Peter Griffin <peter.griffin@linaro.org>
+Peter Molnar <pmolnar.u-szeged@partner.samsung.com>
+Peter Snyder <snyderp@gmail.com>
+Peter Wong <peter.wm.wong@gmail.com>
+Philip Hanson <philip.hanson@intel.com>
+Philipp Hancke <fippo@andyet.net>
+Philipp Hancke <philipp.hancke@googlemail.com>
+Philippe Beauchamp <philippe.beauchamp@gmail.com>
+Philippe Beaudoin <philippe.beaudoin@gmail.com>
+PhistucK <phistuck@gmail.com>
+Pierre Neter <pierreneter@gmail.com>
+Pierre-Antoine LaFayette <pierre.lafayette@gmail.com>
+Po-Chun Chang <pochang0403@gmail.com>
+Pramod Begur Srinath <pramod.bs@samsung.com>
+Pranay Kumar <pranay.kumar@samsung.com>
+Pranjal Jumde <pranjal@brave.com>
+Prashant Hiremath <prashhir@cisco.com>
+Prashant Nevase <prashant.n@samsung.com>
+Prashant Patil <prashant.patil@imgtec.com>
+Praveen Akkiraju <praveen.anp@samsung.com>
+Preeti Nayak <preeti.nayak@samsung.com>
+Pritam Nikam <pritam.nikam@samsung.com>
+Puttaraju R <puttaraju.r@samsung.com>
+Qi Yang <qi1988.yang@samsung.com>
+Qiankun Miao <qiankun.miao@intel.com>
+Qing Zhang <qing.zhang@intel.com>
+Radu Stavila <stavila@adobe.com>
+Radu Velea <radu.velea@intel.com>
+Rafael Antognolli <rafael.antognolli@intel.com>
+Raghavendra Ghatage <r.ghatage@samsung.com>
+Raghu Ram Nagaraj <r.nagaraj@samsung.com>
+Rahul Gupta <rahul.g@samsung.com>
+Rajneesh Rana <rajneesh.r@samsung.com>
+Raman Tenneti <raman.tenneti@gmail.com>
+Ramkumar Gokarnesan <ramkumar.gokarnesan@gmail.com>
+Ramkumar Ramachandra <artagnon@gmail.com>
+Ramya Vadlamudi <ramya.v@samsung.com>
+Randy Posynick <randy.posynick@gmail.com>
+Raphael Kubo da Costa <raphael.kubo.da.costa@intel.com>
+Raul Tambre <raul@tambre.ee>
+Raveendra Karu <r.karu@samsung.com>
+Ravi Nanjundappa <nravi.n@samsung.com>
+Ravi Phaneendra Kasibhatla <r.kasibhatla@samsung.com>
+Ravi Phaneendra Kasibhatla <ravi.kasibhatla@motorola.com>
+Raviraj Sitaram <raviraj.p.sitaram@intel.com>
+Réda Housni Alaoui <alaoui.rda@gmail.com>
+Refael Ackermann <refack@gmail.com>
+Renata Hodovan <rhodovan.u-szeged@partner.samsung.com>
+Rene Bolldorf <rb@radix.io>
+Rene Ladan <r.c.ladan@gmail.com>
+Richard Baranyi <lordprotector@gmail.com>
+Richard Li <richard.li@intel.com>
+Rijubrata Bhaumik <rijubrata.bhaumik@intel.com>
+Riku Voipio <riku.voipio@linaro.org>
+Rob Buis <rob.buis@samsung.com>
+Rob Wu <rob@robwu.nl>
+Robert Bear Travis <bear.travis@gmail.com>
+Robert Bear Travis <betravis@adobe.com>
+Robert Bradford <robert.bradford@intel.com>
+Robert Goldberg <goldberg@adobe.com>
+Robert Hogan <robhogan@gmail.com>
+Robert Nagy <robert.nagy@gmail.com>
+Robert Sesek <rsesek@bluestatic.org>
+Roland Takacs <rtakacs.u-szeged@partner.samsung.com>
+Romain Pokrzywka <romain.pokrzywka@gmail.com>
+Rosen Dash <nqk836@motorola.com>
+Rosen Dash <rosen.dash@gmail.com>
+Ross Kirsling <rkirsling@gmail.com>
+ruben <chromium@hybridsource.org>
+Ruben Bridgewater <ruben@bridgewater.de>
+Ruben Terrazas <rubentopo@gmail.com>
+Rufus Hamade <rufus.hamade@imgtec.com>
+Ruiyi Luo <luoruiyi2008@gmail.com>
+Ryan Ackley <ryanackley@gmail.com>
+Ryan Norton <rnorton10@gmail.com>
+Ryan Sleevi <ryan-chromium-dev@sleevi.com>
+Ryan Yoakum <ryoakum@skobalt.com>
+Ryuan Choi <ryuan.choi@samsung.com>
+Saikrishna Arcot <saiarcot895@gmail.com>
+Sajal Khandelwal <skhandelwa22@bloomberg.net>
+Salvatore Iovene <salvatore.iovene@intel.com>
+Sam Larison <qufighter@gmail.com>
+Sam McDonald <sam@sammcd.com>
+Samuel Attard <samuel.r.attard@gmail.com>
+Sanggi Hong <sanggi.hong11@gmail.com>
+Sanghee Lee <sanghee.lee1992@gmail.com>
+Sanghyun Park <sh919.park@samsung.com>
+Sanghyup Lee <sh53.lee@samsung.com>
+Sangjoon Je <htamop@gmail.com>
+Sangseok Jang <sangseok.jang@navercorp.com>
+Sangwoo Ko <sangwoo.ko@navercorp.com>
+Sangwoo Ko <sangwoo108@gmail.com>
+Sanjoy Pal <ncj674@motorola.com>
+Sanjoy Pal <sanjoy.pal@samsung.com>
+Sanne Wouda <sanne.wouda@gmail.com>
+Santosh Mahto <samahto@cisco.com>
+Sarath Singapati <s.singapati@gmail.com>
+Sarath Singapati <s.singapati@samsung.com>
+Sarath Singapati <sarath.singapati@huawei.com>
+Saravanan KR <sramajay@cisco.com>
+Sathish Kuppuswamy <sathish.kuppuswamy@intel.com>
+Satoshi Matsuzaki <satoshi.matsuzaki@gmail.com>
+Satyajit Sahu <satyajit.sahu@amd.com>
+Sayan Nayak <sayan.nayak@samsung.com>
+Scott D Phillips <scott.d.phillips@intel.com>
+Sean Bryant <sean@cyberwang.net>
+Sean DuBois <seaduboi@amazon.com>
+Sebastian Amend <sebastian.amend@googlemail.com>
+Sebastian Krzyszkowiak <dos@dosowisko.net>
+Seo Sanghyeon <sanxiyn@gmail.com>
+Seokju Kwon <seokju.kwon@gmail.com>
+SeongTae Jeong <ferendevelop.gl@gmail.com>
+Sergey Kipet <sergey.kipet@gmail.com>
+Sergey Putilin <p.sergey@samsung.com>
+Sergey Shekyan <shekyan@gmail.com>
+Sergio Carlos Morales Angeles <carloschilazo@gmail.com>
+Sergiy Belozorov <rryk.ua@gmail.com>
+Seshadri Mahalingam <seshadri.mahalingam@gmail.com>
+Seungkyu Lee <zx6658@gmail.com>
+Sevan Janiyan <venture37@geeklan.co.uk>
+Shahriar Rostami <shahriar.rostami@gmail.com>
+Shail Singhal <shail.s@samsung.com>
+Shane Hansen <shanemhansen@gmail.com>
+ShankarGanesh K <blr.bmlab@gmail.com>
+Shanmuga Pandi M <shanmuga.m@samsung.com>
+Shaobo Yan <shaobo.yan@intel.com>
+Shashi Kumar <sk.kumar@samsung.com>
+Shawn Anastasio <shawnanastasio@gmail.com>
+Shelley Vohr <shelley.vohr@gmail.com>
+Shen Yu <shenyu.tcv@gmail.com>
+Sherry Mou <wenjinm@amazon.com>
+Shez Baig <sbaig1@bloomberg.net>
+Shigeki Ohtsu <shigeki.ohtsu@gmail.com>
+Shiliu Wang <aofdwsl@gmail.com>
+Shiliu Wang <shiliu.wang@intel.com>
+Shilpa Shri <shilpa.shri@samsung.com>
+Shirish S <shirish.s@amd.com>
+Shiva Kumar <shiva.k1@samsung.com>
+Shivakumar JM <shiva.jm@samsung.com>
+Shouqun Liu <liushouqun@xiaomi.com>
+Shouqun Liu <shouqun.liu@intel.com>
+Shreeram Kushwaha <shreeram.k@samsung.com>
+Shreyas Gopal <shreyas.g@samsung.com>
+Shreyas VA <v.a.shreyas@gmail.com>
+Shubham Agrawal <shubag@amazon.com>
+Siba Samal <siba.samal@samsung.com>
+Siddharth Bagai <b.siddharth@samsung.com>
+Siddharth Shankar <funkysidd@gmail.com>
+Simon Arlott <simon.arlott@gmail.com>
+Simon La Macchia <smacchia@amazon.com>
+Siva Kumar Gunturi <siva.gunturi@samsung.com>
+Sohan Jyoti Ghosh <sohan.jyoti@huawei.com>
+Sohan Jyoti Ghosh <sohan.jyoti@samsung.com>
+Song YeWen <ffmpeg@gmail.com>
+Sooho Park <sooho1000@gmail.com>
+Soojung Choi <crystal2840@gmail.com>
+Soorya R <soorya.r@samsung.com>
+Soren Dreijer <dreijerbit@gmail.com>
+Sreerenj Balachandran <sreerenj.balachandran@intel.com>
+Srirama Chandra Sekhar Mogali <srirama.m@samsung.com>
+Staphany Park <stapark008@gmail.com>
+Stephen Searles <stephen.searles@gmail.com>
+Steve Sanders <steve@zanderz.com>
+Steven Pennington <spenn@engr.uvic.ca>
+Steven Roussey <sroussey@gmail.com>
+Subrahmanya Praveen Munukutla <sataya.m@samsung.com>
+Suchit Agrawal <a.suchit@samsung.com>
+Sudarsana Babu Nagineni <sudarsana.nagineni@intel.com>
+Sudarshan Parthasarathy <sudarshan.p@samsung.com>
+Sujae Jo <sujae33.jo@gmail.com>
+Sujith S S <sujiths.s@samsung.com>
+Sunchang Li <johnstonli@tencent.com>
+Suneel Kota <suneel.kota@samsung.com>
+Sungguk Lim <limasdf@gmail.com>
+Sungmann Cho <sungmann.cho@gmail.com>
+Sungmann Cho <sungmann.cho@navercorp.com>
+Sunil Ratnu <sunil.ratnu@samsung.com>
+Sunitha Srivatsa <srivats@amazon.com>
+Suvanjan Mukherjee <suvanjanmukherjee@gmail.com>
+Suyambulingam R M <suyambu.rm@samsung.com>
+Suyash Sengar <suyash.s@samsung.com>
+Swarali Raut <swarali.sr@samsung.com>
+Swati Jaiswal <swa.jaiswal@samsung.com>
+Sylvain Zimmer <sylvinus@gmail.com>
+Sylvestre Ledru <sylvestre.ledru@gmail.com>
+Synthia Islam <synthia.is@samsung.com>
+Szabolcs David <davidsz@inf.u-szeged.hu>
+Szymon Piechowicz <szymonpiechowicz@o2.pl>
+Taeheon Kim <skyrabbits1@gmail.com>
+Taehoon Lee <taylor.hoon@gmail.com>
+Takashi Fujita <tgfjt.mail@gmail.com>
+Takeshi Kurosawa <taken.spc@gmail.com>
+Tanay Chowdhury <tanay.c@samsung.com>
+Tanvir Rizvi <tanvir.rizvi@samsung.com>
+Tapu Kumar Ghose <ghose.tapu@gmail.com>
+Taylor Price <trprice@gmail.com>
+Ted Kim <neot0000@gmail.com>
+Ted Vessenes <tedvessenes@gmail.com>
+Teodora Novkovic <teodora.petrovic@gmail.com>
+Thiago Farina <thiago.farina@gmail.com>
+Thiago Marcos P. Santos <thiago.santos@intel.com>
+Thomas Butter <tbutter@gmail.com>
+Thomas Conti <tomc@amazon.com>
+Thomas White <im.toms.inbox@gmail.com>
+Tiago Vignatti <tiago.vignatti@intel.com>
+Tibor Dusnoki <tibor.dusnoki.91@gmail.com>
+Tim Ansell <mithro@mithis.com>
+Tim Niederhausen <tim@rnc-ag.de>
+Timo Gurr <timo.gurr@gmail.com>
+Timo Reimann <ttr314@googlemail.com>
+Timo Witte <timo.witte@gmail.com>
+Ting Shao <ting.shao@intel.com>
+Tom Callaway <tcallawa@redhat.com>
+Tom Harwood <tfh@skip.org>
+Tomas Popela <tomas.popela@gmail.com>
+Torsten Kurbad <google@tk-webart.de>
+Trent Willis <trentmwillis@gmail.com>
+Trevor Perrin <unsafe@trevp.net>
+Tripta Gupta <tripta.g@samsung.com>
+U. Artie Eoff <ullysses.a.eoff@intel.com>
+Umar Hansa <umar.hansa@gmail.com>
+Upendra Gowda <upendrag.gowda@gmail.com>
+Uzair Jaleel <uzair.jaleel@samsung.com>
+Vadim Gorbachev <bmsdave@gmail.com>
+Vaibhav Agrawal <vaibhav1.a@samsung.com>
+Valentin Ilie <valentin.ilie@intel.com>
+Vamshikrishna Yellenki <vamshi@motorola.com>
+Vani Hegde <vani.hegde@samsung.com>
+Varun Chowdhary Paturi <v.paturi@samsung.com>
+Vartul Katiyar <vartul.k@samsung.com>
+Vedran Šajatović <vedran.sajatovic@gmail.com>
+Vernon Tang <vt@foilhead.net>
+Viatcheslav Ostapenko <sl.ostapenko@samsung.com>
+Victor Costan <costan@gmail.com>
+Viet-Trung Luu <viettrungluu@gmail.com>
+Vinay Anantharaman <vinaya@adobe.com>
+Vipul Bhasin <vipul.bhasin@gmail.com>
+Visa Putkinen <v.putkinen@partner.samsung.com>
+Vishal Bhatnagar <vishal.b@samsung.com>
+Vitaliy Kharin <kvserr@gmail.com>
+Vivek Galatage <vivek.vg@samsung.com>
+Volker Sorge <volker.sorge@gmail.com>
+Waihung Fu <fufranci@amazon.com>
+Wanming Lin <wanming.lin@intel.com>
+Wei Li <wei.c.li@intel.com>
+WenSheng He <wensheng.he@samsung.com>
+Wesley Lancel <wesleylancel@gmail.com>
+Wesley Wigham <wwigham@gmail.com>
+Will Hirsch <chromium@willhirsch.co.uk>
+Will Shackleton <w.shackleton@gmail.com>
+William Xie <william.xie@intel.com>
+Xiang Long <xiang.long@intel.com>
+Xiangze Zhang <xiangze.zhang@intel.com>
+Xiaofeng Zhang <xiaofeng.zhang@intel.com>
+Xiaolei Yu <dreifachstein@gmail.com>
+Xiaoshu Zhang <xiaoshu@amazon.com>
+Xiaoyin Liu <xiaoyin.l@outlook.com>
+Xinchao He <hexinchao@gmail.com>
+Xing Zhang <xzhang@adobe.com>
+Xinghua Cao <xinghua.cao@intel.com>
+Xu Samuel <samuel.xu@intel.com>
+Xu Xing <xing.xu@intel.com>
+Xuefei Ren <xrenishere@gmail.com>
+Xueqing Huang <huangxueqing@xiaomi.com>
+Xun Sun <xun.sun@intel.com>
+Xunran Ding <xunran.ding@samsung.com>
+Xunran Ding <dingxunran@gmail.com>
+Yael Aharon <yael.aharon@intel.com>
+Yan Wang <yan0422.wang@samsung.com>
+Yang Gu <yang.gu@intel.com>
+Yannic Bonenberger <contact@yannic-bonenberger.com>
+Yarin Kaul <yarin.kaul@gmail.com>
+Yash Vempati <vempatiy@amazon.com>
+Ye Liu <cbakgly@gmail.com>
+Yeol Park <peary2@gmail.com>
+Yeonwoo Jo <yeonwoo.jo.92@gmail.com>
+Yi Shen <yi.shen@samsung.com>
+Yi Sun <ratsunny@gmail.com>
+Yichen Jiang <jiangyichen123@gmail.com>
+Yifei Yu <yuyifei@xiaomi.com>
+Yizhou Jiang <yizhou.jiang@intel.com>
+Yoav Weiss <yoav@yoav.ws>
+Yoav Zilberberg <yoav.zilberberg@gmail.com>
+Yong Shin <sy3620@gmail.com>
+Yong Wang <ccyongwang@tencent.com>
+Yongha Lee <yongha78.lee@samsung.com>
+Yongseok Choi <yongseok.choi@navercorp.com>
+Yongsheng Zhu <yongsheng.zhu@intel.com>
+Yoonjae Cho <yoonjae.cho92@gmail.com>
+Yoshinori Sano <yoshinori.sano@gmail.com>
+Youngho Seo <hazivoo@gmail.com>
+Youngjin Choi <cyjin9.yc@gmail.com>
+YoungKi Hong <simon.hong81@gmail.com>
+Youngmin Yoo <youngmin.yoo@samsung.com>
+Youngsoo Choi <kenshin.choi@samsung.com>
+Youngsun Suh <zard17@gmail.com>
+Yuhong Sha <yuhong.sha@samsung.com>
+Yumikiyo Osanai <yumios.art@gmail.com>
+Yunchao He <yunchao.he@intel.com>
+Yupei Lin <yplam@yplam.com>
+Yupei Wang <perryuwang@tencent.com>
+Yura Yaroshevich <yura.yaroshevich@gmail.com>
+Yuri Gorobets <yuri.gorobets@gmail.com>
+Yuriy Taraday <yorik.sar@gmail.com>
+Yuvanesh Natarajan <yuvanesh.n1@samsung.com>
+Zeno Albisser <zeno.albisser@digia.com>
+Zeqin Chen <talonchen@tencent.com>
+Zhaoze Zhou <zhaoze.zhou@partner.samsung.com>
+Zheda Chen <zheda.chen@intel.com>
+Zheng Chuang <zhengchuangscu@gmail.com>
+Zhengkun Li <zhengkli@amazon.com>
+Zhenyu Liang <zhenyu.liang@intel.com>
+Zhenyu Shan <zhenyu.shan@intel.com>
+Zhifei Fang <facetothefate@gmail.com>
+Zhuoyu Qian <zhuoyu.qian@samsung.com>
+Ziran Sun <ziran.sun@samsung.com>
+Zoltan Herczeg <zherczeg.u-szeged@partner.samsung.com>
+Zoltan Kuscsik <zoltan.kuscsik@linaro.org>
+Zsolt Borbely <zsborbely.u-szeged@partner.samsung.com>
+方觉 (Fang Jue) <fangjue23303@gmail.com>
+Rajesh Mahindra <rmahindra@uber.com>
+Yuan-Pin Yu <yjames@uber.com>
+Vinoth Chandar <vinoth@uber.com>
+Zheng Xu <zxu@kobo.com>
+Junsong Li <ljs.darkfish@gmail.com>
+
+ACCESS CO., LTD. <*@access-company.com>
+Akamai Inc. <*@akamai.com>
+ARM Holdings <*@arm.com>
+BlackBerry Limited <*@blackberry.com>
+Bocoup <*@bocoup.com>
+Canonical Limited <*@canonical.com>
+Cloudflare, Inc. <*@cloudflare.com>
+Code Aurora Forum <*@codeaurora.org>
+Collabora Limited <*@collabora.com>
+Comodo CA Limited
+Cosium <*@cosium.com>
+Duck Duck Go, Inc. <*@duckduckgo.com>
+Endless Mobile, Inc. <*@endlessm.com>
+Estimote, Inc. <*@estimote.com>
+Facebook, Inc. <*@fb.com>
+Facebook, Inc. <*@oculus.com>
+Google Inc. <*@google.com>
+Hewlett-Packard Development Company, L.P. <*@hp.com>
+IBM Inc. <*@*.ibm.com>
+IBM Inc. <*@ibm.com>
+Igalia S.L. <*@igalia.com>
+Imagination Technologies Limited <*@imagination.corp-partner.google.com>
+Impossible Dreams Network <*@impossibledreams.net>
+Intel Corporation <*@intel.com>
+LG Electronics, Inc. <*@lge.com>
+Loongson Technology Corporation Limited. <*@loongson.cn>
+Macadamian <*@macadamian.com>
+Mediatek <*@mediatek.com>
+Microsoft <*@microsoft.com>
+MIPS Technologies, Inc. <*@mips.com>
+Mozilla Corporation <*@mozilla.com>
+Neverware Inc. <*@neverware.com>
+NIKE, Inc. <*@nike.com>
+NVIDIA Corporation <*@nvidia.com>
+Opera Software ASA <*@opera.com>
+Optical Tone Ltd <*@opticaltone.com>
+Pengutronix e.K. <*@pengutronix.de>
+Rakuten Kobo Inc. <*@kobo.com>
+Rakuten Kobo Inc. <*@rakuten.com>
+Seznam.cz, a.s. <*@firma.seznam.cz>
+Slack Technologies Inc. <*@slack-corp.com>
+Spotify AB <*@spotify.com>
+Tableau Software <*@tableau.com>
+TeamSpeak Systems GmbH <*@teamspeak.com>
+The Chromium Authors <*@chromium.org>
+The MathWorks, Inc. <binod.pant@mathworks.com>
+Torchmobile Inc.
+Upwork <*@cloud.upwork.com>
+Venture 3 Systems LLC <*@venture3systems.com>
+Vewd Software AS <*@vewd.com>
+Vivaldi Technologies AS <*@vivaldi.com>
+Yandex LLC <*@yandex-team.ru>
+Make Positive Provar Limited <*@provartesting.com>
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a32e00c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..2823b98
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,5 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+workspace(name = "com_google_googleurl")
diff --git a/base/BUILD b/base/BUILD
new file mode 100644
index 0000000..a9ca0e6
--- /dev/null
+++ b/base/BUILD
@@ -0,0 +1,21 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "base",
+ hdrs = [
+ "compiler_specific.h",
+ "debug/leak_annotations.h",
+ "macros.h",
+ "no_destructor.h",
+ "optional.h",
+ "stl_util.h",
+ "template_util.h",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//build:build_config",
+ "//polyfills",
+ ],
+)
diff --git a/base/compiler_specific.h b/base/compiler_specific.h
new file mode 100644
index 0000000..7e2c510
--- /dev/null
+++ b/base/compiler_specific.h
@@ -0,0 +1,263 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_COMPILER_SPECIFIC_H_
+#define BASE_COMPILER_SPECIFIC_H_
+
+#include "build/build_config.h"
+
+#if defined(COMPILER_MSVC)
+
+#if !defined(__clang__)
+#error "Only clang-cl is supported on Windows, see https://crbug.com/988071"
+#endif
+
+// Macros for suppressing and disabling warnings on MSVC.
+//
+// Warning numbers are enumerated at:
+// http://msdn.microsoft.com/en-us/library/8x5x43k7(VS.80).aspx
+//
+// The warning pragma:
+// http://msdn.microsoft.com/en-us/library/2c8f766e(VS.80).aspx
+//
+// Using __pragma instead of #pragma inside macros:
+// http://msdn.microsoft.com/en-us/library/d9x1s805.aspx
+
+// MSVC_PUSH_DISABLE_WARNING pushes |n| onto a stack of warnings to be disabled.
+// The warning remains disabled until popped by MSVC_POP_WARNING.
+#define MSVC_PUSH_DISABLE_WARNING(n) __pragma(warning(push)) \
+ __pragma(warning(disable:n))
+
+// Pop effects of innermost MSVC_PUSH_* macro.
+#define MSVC_POP_WARNING() __pragma(warning(pop))
+
+#else // Not MSVC
+
+#define MSVC_PUSH_DISABLE_WARNING(n)
+#define MSVC_POP_WARNING()
+#define MSVC_DISABLE_OPTIMIZE()
+#define MSVC_ENABLE_OPTIMIZE()
+
+#endif // COMPILER_MSVC
+
+// Debugging aids that temporarily disable optimizations for a single function
+// or file while investigating compiler bugs or issues in local optimized
+// builds. Never use them as a permanent workaround or in landed changes.
+// The clang variant uses the standard _Pragma operator because __pragma is a
+// Microsoft extension that clang only accepts with -fms-extensions (clang-cl).
+#if !defined(OFFICIAL_BUILD)
+#if defined(__clang__)
+#define DISABLE_OPTIMIZE() _Pragma("clang optimize off")
+#define ENABLE_OPTIMIZE() _Pragma("clang optimize on")
+#elif defined(COMPILER_MSVC)
+#define DISABLE_OPTIMIZE() __pragma(optimize("", off))
+#define ENABLE_OPTIMIZE() __pragma(optimize("", on))
+#else
+// These macros are not currently available for other compiler options.
+#endif
+// These macros are not available in official builds.
+#endif // !defined(OFFICIAL_BUILD)
+
+// Annotate a variable indicating it's ok if the variable is not used.
+// (Typically used to silence a compiler warning when the assignment
+// is important for some other reason.)
+// Use like:
+// int x = ...;
+// ALLOW_UNUSED_LOCAL(x);
+#define ALLOW_UNUSED_LOCAL(x) (void)x
+
+// Annotate a typedef or function indicating it's ok if it's not used.
+// Use like:
+// typedef Foo Bar ALLOW_UNUSED_TYPE;
+#if defined(COMPILER_GCC) || defined(__clang__)
+#define ALLOW_UNUSED_TYPE __attribute__((unused))
+#else
+#define ALLOW_UNUSED_TYPE
+#endif
+
+// Annotate a function indicating it should not be inlined.
+// Use like:
+// NOINLINE void DoStuff() { ... }
+#if defined(COMPILER_GCC)
+#define NOINLINE __attribute__((noinline))
+#elif defined(COMPILER_MSVC)
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+
+#if defined(COMPILER_GCC) && defined(NDEBUG)
+#define ALWAYS_INLINE inline __attribute__((__always_inline__))
+#elif defined(COMPILER_MSVC) && defined(NDEBUG)
+#define ALWAYS_INLINE __forceinline
+#else
+#define ALWAYS_INLINE inline
+#endif
+
+// Specify memory alignment for structs, classes, etc.
+// Use like:
+// class ALIGNAS(16) MyClass { ... }
+// ALIGNAS(16) int array[4];
+//
+// In most places you can use the C++11 keyword "alignas", which is preferred.
+//
+// But compilers have trouble mixing __attribute__((...)) syntax with
+// alignas(...) syntax.
+//
+// Doesn't work in clang or gcc:
+// struct alignas(16) __attribute__((packed)) S { char c; };
+// Works in clang but not gcc:
+// struct __attribute__((packed)) alignas(16) S2 { char c; };
+// Works in clang and gcc:
+// struct alignas(16) S3 { char c; } __attribute__((packed));
+//
+// There are also some attributes that must be specified *before* a class
+// definition: visibility (used for exporting functions/classes) is one of
+// these attributes. This means that it is not possible to use alignas() with a
+// class that is marked as exported.
+#if defined(COMPILER_MSVC)
+#define ALIGNAS(byte_alignment) __declspec(align(byte_alignment))
+#elif defined(COMPILER_GCC)
+#define ALIGNAS(byte_alignment) __attribute__((aligned(byte_alignment)))
+#endif
+
+// Annotate a function indicating the caller must examine the return value.
+// Use like:
+// int foo() WARN_UNUSED_RESULT;
+// To explicitly ignore a result, see |ignore_result()| in base/macros.h.
+#undef WARN_UNUSED_RESULT
+#if defined(COMPILER_GCC) || defined(__clang__)
+#define WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else
+#define WARN_UNUSED_RESULT
+#endif
+
+// Tell the compiler a function is using a printf-style format string.
+// |format_param| is the one-based index of the format string parameter;
+// |dots_param| is the one-based index of the "..." parameter.
+// For v*printf functions (which take a va_list), pass 0 for dots_param.
+// (This is undocumented but matches what the system C headers do.)
+// For member functions, the implicit this parameter counts as index 1.
+#if defined(COMPILER_GCC) || defined(__clang__)
+#define PRINTF_FORMAT(format_param, dots_param) \
+ __attribute__((format(printf, format_param, dots_param)))
+#else
+#define PRINTF_FORMAT(format_param, dots_param)
+#endif
+
+// WPRINTF_FORMAT is the same, but for wide format strings.
+// This doesn't appear to yet be implemented in any compiler.
+// See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38308 .
+#define WPRINTF_FORMAT(format_param, dots_param)
+// If available, it would look like:
+// __attribute__((format(wprintf, format_param, dots_param)))
+
+// Sanitizers annotations.
+#if defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define NO_SANITIZE(what) __attribute__((no_sanitize(what)))
+#endif
+#endif
+#if !defined(NO_SANITIZE)
+#define NO_SANITIZE(what)
+#endif
+
+// MemorySanitizer annotations.
+#if defined(MEMORY_SANITIZER) && !defined(OS_NACL)
+#include <sanitizer/msan_interface.h>
+
+// Mark a memory region fully initialized.
+// Use this to annotate code that deliberately reads uninitialized data, for
+// example a GC scavenging root set pointers from the stack.
+#define MSAN_UNPOISON(p, size) __msan_unpoison(p, size)
+
+// Check a memory region for initializedness, as if it was being used here.
+// If any bits are uninitialized, crash with an MSan report.
+// Use this to sanitize data which MSan won't be able to track, e.g. before
+// passing data to another process via shared memory.
+#define MSAN_CHECK_MEM_IS_INITIALIZED(p, size) \
+ __msan_check_mem_is_initialized(p, size)
+#else // MEMORY_SANITIZER
+#define MSAN_UNPOISON(p, size)
+#define MSAN_CHECK_MEM_IS_INITIALIZED(p, size)
+#endif // MEMORY_SANITIZER
+
+// DISABLE_CFI_PERF -- Disable Control Flow Integrity for perf reasons.
+#if !defined(DISABLE_CFI_PERF)
+#if defined(__clang__) && defined(OFFICIAL_BUILD)
+#define DISABLE_CFI_PERF __attribute__((no_sanitize("cfi")))
+#else
+#define DISABLE_CFI_PERF
+#endif
+#endif
+
+// Macro useful for writing cross-platform function pointers.
+#if !defined(CDECL)
+#if defined(OS_WIN)
+#define CDECL __cdecl
+#else // defined(OS_WIN)
+#define CDECL
+#endif // defined(OS_WIN)
+#endif // !defined(CDECL)
+
+// Macro for hinting that an expression is likely to be false.
+#if !defined(UNLIKELY)
+#if defined(COMPILER_GCC) || defined(__clang__)
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define UNLIKELY(x) (x)
+#endif // defined(COMPILER_GCC) || defined(__clang__)
+#endif // !defined(UNLIKELY)
+
+#if !defined(LIKELY)
+#if defined(COMPILER_GCC) || defined(__clang__)
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#else
+#define LIKELY(x) (x)
+#endif // defined(COMPILER_GCC) || defined(__clang__)
+#endif // !defined(LIKELY)
+
+// Compiler feature-detection.
+// clang.llvm.org/docs/LanguageExtensions.html#has-feature-and-has-extension
+#if defined(__has_feature)
+#define HAS_FEATURE(FEATURE) __has_feature(FEATURE)
+#else
+#define HAS_FEATURE(FEATURE) 0
+#endif
+
+// Macro for telling -Wimplicit-fallthrough that a fallthrough is intentional.
+#if defined(__clang__)
+#define FALLTHROUGH [[clang::fallthrough]]
+#else
+#define FALLTHROUGH
+#endif
+
+#if defined(COMPILER_GCC)
+#define PRETTY_FUNCTION __PRETTY_FUNCTION__
+#elif defined(COMPILER_MSVC)
+#define PRETTY_FUNCTION __FUNCSIG__
+#else
+// See https://en.cppreference.com/w/c/language/function_definition#func
+#define PRETTY_FUNCTION __func__
+#endif
+
+#if !defined(CPU_ARM_NEON)
+#if defined(__arm__)
+#if !defined(__ARMEB__) && !defined(__ARM_EABI__) && !defined(__EABI__) && \
+ !defined(__VFP_FP__) && !defined(_WIN32_WCE) && !defined(ANDROID)
+#error Chromium does not support middle endian architecture
+#endif
+#if defined(__ARM_NEON__)
+#define CPU_ARM_NEON 1
+#endif
+#endif // defined(__arm__)
+#endif // !defined(CPU_ARM_NEON)
+
+#if !defined(HAVE_MIPS_MSA_INTRINSICS)
+#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
+#define HAVE_MIPS_MSA_INTRINSICS 1
+#endif
+#endif
+
+#endif // BASE_COMPILER_SPECIFIC_H_
diff --git a/base/debug/leak_annotations.h b/base/debug/leak_annotations.h
new file mode 100644
index 0000000..dc50246
--- /dev/null
+++ b/base/debug/leak_annotations.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_DEBUG_LEAK_ANNOTATIONS_H_
+#define BASE_DEBUG_LEAK_ANNOTATIONS_H_
+
+#include "base/macros.h"
+#include "build/build_config.h"
+
+// This file defines macros which can be used to annotate intentional memory
+// leaks. Support for annotations is implemented in LeakSanitizer. Annotated
+// objects will be treated as a source of live pointers, i.e. any heap objects
+// reachable by following pointers from an annotated object will not be
+// reported as leaks.
+//
+// ANNOTATE_SCOPED_MEMORY_LEAK: all allocations made in the current scope
+// will be annotated as leaks.
+// ANNOTATE_LEAKING_OBJECT_PTR(X): the heap object referenced by pointer X will
+// be annotated as a leak.
+
+#if defined(LEAK_SANITIZER) && !defined(OS_NACL)
+
+#include <sanitizer/lsan_interface.h>
+
+class ScopedLeakSanitizerDisabler {
+ public:
+ ScopedLeakSanitizerDisabler() { __lsan_disable(); }
+ ~ScopedLeakSanitizerDisabler() { __lsan_enable(); }
+ private:
+ DISALLOW_COPY_AND_ASSIGN(ScopedLeakSanitizerDisabler);
+};
+
+#define ANNOTATE_SCOPED_MEMORY_LEAK \
+ ScopedLeakSanitizerDisabler leak_sanitizer_disabler; static_cast<void>(0)
+
+#define ANNOTATE_LEAKING_OBJECT_PTR(X) __lsan_ignore_object(X)
+
+#else
+
+#define ANNOTATE_SCOPED_MEMORY_LEAK ((void)0)
+#define ANNOTATE_LEAKING_OBJECT_PTR(X) ((void)0)
+
+#endif
+
+#endif // BASE_DEBUG_LEAK_ANNOTATIONS_H_
diff --git a/base/macros.h b/base/macros.h
new file mode 100644
index 0000000..cda8e3a
--- /dev/null
+++ b/base/macros.h
@@ -0,0 +1,44 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This file contains macros and macro-like constructs (e.g., templates) that
+// are commonly used throughout Chromium source. (It may also contain things
+// that are closely related to things that are commonly used that belong in this
+// file.)
+
+#ifndef BASE_MACROS_H_
+#define BASE_MACROS_H_
+
+// Put this in the declarations for a class to be uncopyable.
+#define DISALLOW_COPY(TypeName) \
+ TypeName(const TypeName&) = delete
+
+// Put this in the declarations for a class to be unassignable.
+#define DISALLOW_ASSIGN(TypeName) TypeName& operator=(const TypeName&) = delete
+
+// Put this in the declarations for a class to be uncopyable and unassignable.
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ DISALLOW_COPY(TypeName); \
+ DISALLOW_ASSIGN(TypeName)
+
+// A macro to disallow all the implicit constructors, namely the
+// default constructor, copy constructor and operator= functions.
+// This is especially useful for classes containing only static methods.
+#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
+ TypeName() = delete; \
+ DISALLOW_COPY_AND_ASSIGN(TypeName)
+
+// Used to explicitly mark the return value of a function as unused. If you are
+// really sure you don't want to do anything with the return value of a function
+// that has been marked WARN_UNUSED_RESULT, wrap it with this. Example:
+//
+// std::unique_ptr<MyType> my_var = ...;
+// if (TakeOwnership(my_var.get()) == SUCCESS)
+// ignore_result(my_var.release());
+//
+template<typename T>
+inline void ignore_result(const T&) {
+}
+
+#endif // BASE_MACROS_H_
diff --git a/base/no_destructor.h b/base/no_destructor.h
new file mode 100644
index 0000000..3d7a85c
--- /dev/null
+++ b/base/no_destructor.h
@@ -0,0 +1,98 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_NO_DESTRUCTOR_H_
+#define BASE_NO_DESTRUCTOR_H_
+
+#include <new>
+#include <utility>
+
+namespace gurl_base {
+
+// A wrapper that makes it easy to create an object of type T with static
+// storage duration that:
+// - is only constructed on first access
+// - never invokes the destructor
+// in order to satisfy the styleguide ban on global constructors and
+// destructors.
+//
+// Runtime constant example:
+// const std::string& GetLineSeparator() {
+// // Forwards to std::string(size_t, char, const Allocator&) constructor.
+// static const gurl_base::NoDestructor<std::string> s(5, '-');
+// return *s;
+// }
+//
+// More complex initialization with a lambda:
+// const std::string& GetSessionNonce() {
+// static const gurl_base::NoDestructor<std::string> nonce([] {
+// std::string s(16, '\0');
+// crypto::RandString(s.data(), s.size());
+// return s;
+// }());
+// return *nonce;
+// }
+//
+// NoDestructor<T> stores the object inline, so it also avoids a pointer
+// indirection and a malloc. Also note that since C++11 static local variable
+// initialization is thread-safe and so is this pattern. Code should prefer to
+// use NoDestructor<T> over:
+// - A function scoped static T* or T& that is dynamically initialized.
+// - A global gurl_base::LazyInstance<T>.
+//
+// Note that since the destructor is never run, this *will* leak memory if used
+// as a stack or member variable. Furthermore, a NoDestructor<T> should never
+// have global scope as that may require a static initializer.
+template <typename T>
+class NoDestructor {
+ public:
+ // Not constexpr; just write static constexpr T x = ...; if the value should
+ // be a constexpr.
+ template <typename... Args>
+ explicit NoDestructor(Args&&... args) {
+ new (storage_) T(std::forward<Args>(args)...);
+ }
+
+ // Allows copy and move construction of the contained type, to allow
+ // construction from an initializer list, e.g. for std::vector.
+ explicit NoDestructor(const T& x) { new (storage_) T(x); }
+ explicit NoDestructor(T&& x) { new (storage_) T(std::move(x)); }
+
+ NoDestructor(const NoDestructor&) = delete;
+ NoDestructor& operator=(const NoDestructor&) = delete;
+
+ ~NoDestructor() = default;
+
+ const T& operator*() const { return *get(); }
+ T& operator*() { return *get(); }
+
+ const T* operator->() const { return get(); }
+ T* operator->() { return get(); }
+
+ const T* get() const { return reinterpret_cast<const T*>(storage_); }
+ T* get() { return reinterpret_cast<T*>(storage_); }
+
+ private:
+ alignas(T) char storage_[sizeof(T)];
+
+#if defined(LEAK_SANITIZER)
+ // TODO(https://crbug.com/812277): This is a hack to work around the fact
+ // that LSan doesn't seem to treat NoDestructor as a root for reachability
+ // analysis. This means that code like this:
+ // static gurl_base::NoDestructor<std::vector<int>> v({1, 2, 3});
+ // is considered a leak. Using the standard leak sanitizer annotations to
+ // suppress leaks doesn't work: std::vector is implicitly constructed before
+ // calling the gurl_base::NoDestructor constructor.
+ //
+ // Unfortunately, I haven't been able to demonstrate this issue in simpler
+ // reproductions: until that's resolved, hold an explicit pointer to the
+ // placement-new'd object in leak sanitizer mode to help LSan realize that
+ // objects allocated by the contained type are still reachable.
+ T* storage_ptr_ = reinterpret_cast<T*>(storage_);
+#endif // defined(LEAK_SANITIZER)
+};
+
+} // namespace gurl_base
+
+#endif // BASE_NO_DESTRUCTOR_H_
diff --git a/base/optional.h b/base/optional.h
new file mode 100644
index 0000000..345147c
--- /dev/null
+++ b/base/optional.h
@@ -0,0 +1,937 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_OPTIONAL_H_
+#define BASE_OPTIONAL_H_
+
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+#include "polyfills/base/logging.h"
+#include "base/template_util.h"
+
+namespace gurl_base {
+
+// Tag type used to denote an Optional that holds no value. Specification:
+// http://en.cppreference.com/w/cpp/utility/optional/nullopt_t
+struct nullopt_t {
+ constexpr explicit nullopt_t(int) {}
+};
+
+// The nullopt_t constant used to construct, reset and compare empty Optionals.
+// Specification: http://en.cppreference.com/w/cpp/utility/optional/nullopt
+constexpr nullopt_t nullopt(0);
+
+// Forward declaration, which is referred to by the following helpers.
+template <typename T>
+class Optional;
+
+namespace internal {
+
+template <typename T, bool = std::is_trivially_destructible<T>::value>
+struct OptionalStorageBase {
+ // Initializing |empty_| here instead of using default member initializing
+ // to avoid errors in g++ 4.8.
+ constexpr OptionalStorageBase() : empty_('\0') {}
+
+ template <class... Args>
+ constexpr explicit OptionalStorageBase(in_place_t, Args&&... args)
+ : is_populated_(true), value_(std::forward<Args>(args)...) {}
+
+ // When T is not trivially destructible we must call its
+ // destructor before deallocating its memory.
+ // Note that this hides the (implicitly declared) move constructor, which
+ // would be used for constexpr move constructor in OptionalStorage<T>.
+ // It is needed iff T is trivially move constructible. However, the current
+ // is_trivially_{copy,move}_constructible implementation requires
+ // is_trivially_destructible (which looks like a bug, cf:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51452 and
+ // http://cplusplus.github.io/LWG/lwg-active.html#2116), so it is not
+ // necessary for this case at the moment. Please see also the destructor
+ // comment in "is_trivially_destructible = true" specialization below.
+ ~OptionalStorageBase() {
+ if (is_populated_)
+ value_.~T();
+ }
+
+ template <class... Args>
+ void Init(Args&&... args) {
+ GURL_DCHECK(!is_populated_);
+ ::new (&value_) T(std::forward<Args>(args)...);
+ is_populated_ = true;
+ }
+
+ bool is_populated_ = false;
+ union {
+ // |empty_| exists so that the union will always be initialized, even when
+ // it doesn't contain a value. Union members must be initialized for the
+ // constructor to be 'constexpr'.
+ char empty_;
+ T value_;
+ };
+};
+
+template <typename T>
+struct OptionalStorageBase<T, true /* trivially destructible */> {
+ // Initializing |empty_| here instead of using default member initializing
+ // to avoid errors in g++ 4.8.
+ constexpr OptionalStorageBase() : empty_('\0') {}
+
+ template <class... Args>
+ constexpr explicit OptionalStorageBase(in_place_t, Args&&... args)
+ : is_populated_(true), value_(std::forward<Args>(args)...) {}
+
+ // When T is trivially destructible (i.e. its destructor does nothing) there
+ // is no need to call it. Implicitly defined destructor is trivial, because
+ // both members (bool and union containing only variants which are trivially
+ // destructible) are trivially destructible.
+ // Explicitly-defaulted destructor is also trivial, but do not use it here,
+ // because it hides the implicit move constructor. It is needed to implement
+ // constexpr move constructor in OptionalStorage iff T is trivially move
+ // constructible. Note that, if T is trivially move constructible, the move
+ // constructor of OptionalStorageBase<T> is also implicitly defined and it is
+ // a trivial move constructor. If T is not trivially move constructible,
+ // "not declaring move constructor without destructor declaration" here means
+ // "delete move constructor", which works because any move constructor of
+ // OptionalStorage will not refer to it in that case.
+
+ template <class... Args>
+ void Init(Args&&... args) {
+ GURL_DCHECK(!is_populated_);
+ ::new (&value_) T(std::forward<Args>(args)...);
+ is_populated_ = true;
+ }
+
+ bool is_populated_ = false;
+ union {
+ // |empty_| exists so that the union will always be initialized, even when
+ // it doesn't contain a value. Union members must be initialized for the
+ // constructor to be 'constexpr'.
+ char empty_;
+ T value_;
+ };
+};
+
+// Implement conditional constexpr copy and move constructors. These are
+// constexpr if is_trivially_{copy,move}_constructible<T>::value is true
+// respectively. If each is true, the corresponding constructor is defined as
+// "= default;", which generates a constexpr constructor (In this case,
+// the condition of constexpr-ness is satisfied because the base class also has
+// compiler generated constexpr {copy,move} constructors). Note that
+// placement-new is prohibited in constexpr.
+template <typename T,
+ bool = is_trivially_copy_constructible<T>::value,
+ bool = std::is_trivially_move_constructible<T>::value>
+struct OptionalStorage : OptionalStorageBase<T> {
+ // This is the non-trivially {copy,move} constructible case. Other cases are
+ // defined below as specializations.
+
+ // Accessing the members of template base class requires explicit
+ // declaration.
+ using OptionalStorageBase<T>::is_populated_;
+ using OptionalStorageBase<T>::value_;
+ using OptionalStorageBase<T>::Init;
+
+ // Inherit constructors (specifically, the in_place constructor).
+ using OptionalStorageBase<T>::OptionalStorageBase;
+
+ // User-declared constructors suppress the implicit default constructor.
+ // Define it explicitly.
+ OptionalStorage() = default;
+
+ OptionalStorage(const OptionalStorage& other) {
+ if (other.is_populated_)
+ Init(other.value_);
+ }
+
+ OptionalStorage(OptionalStorage&& other) noexcept(
+ std::is_nothrow_move_constructible<T>::value) {
+ if (other.is_populated_)
+ Init(std::move(other.value_));
+ }
+};
+
+template <typename T>
+struct OptionalStorage<T,
+ true /* trivially copy constructible */,
+ false /* trivially move constructible */>
+ : OptionalStorageBase<T> {
+ using OptionalStorageBase<T>::is_populated_;
+ using OptionalStorageBase<T>::value_;
+ using OptionalStorageBase<T>::Init;
+ using OptionalStorageBase<T>::OptionalStorageBase;
+
+ OptionalStorage() = default;
+ OptionalStorage(const OptionalStorage& other) = default;  // Trivial copy.
+
+ OptionalStorage(OptionalStorage&& other) noexcept(
+ std::is_nothrow_move_constructible<T>::value) {
+ if (other.is_populated_)  // An empty |other| leaves this empty too.
+ Init(std::move(other.value_));
+ }
+};
+
+template <typename T>
+struct OptionalStorage<T,
+ false /* trivially copy constructible */,
+ true /* trivially move constructible */>
+ : OptionalStorageBase<T> {
+ using OptionalStorageBase<T>::is_populated_;
+ using OptionalStorageBase<T>::value_;
+ using OptionalStorageBase<T>::Init;
+ using OptionalStorageBase<T>::OptionalStorageBase;
+
+ OptionalStorage() = default;
+ OptionalStorage(OptionalStorage&& other) = default;  // Trivial move.
+
+ OptionalStorage(const OptionalStorage& other) {
+ if (other.is_populated_)  // An empty |other| leaves this empty too.
+ Init(other.value_);
+ }
+};
+
+template <typename T>
+struct OptionalStorage<T,
+ true /* trivially copy constructible */,
+ true /* trivially move constructible */>
+ : OptionalStorageBase<T> {
+ // If T is both trivially copy and trivially move constructible, no
+ // user-defined constructors are needed; just inheriting the constructors
+ // from the base class works.
+ using OptionalStorageBase<T>::OptionalStorageBase;
+};
+
+// Base class to support conditionally usable copy-/move- constructors
+// and assign operators.
+template <typename T>
+class OptionalBase {
+ // This class provides implementation rather than public API, so everything
+ // should be hidden. Often we use composition, but we cannot in this case
+ // because of a C++ language restriction.
+ protected:
+ constexpr OptionalBase() = default;
+ constexpr OptionalBase(const OptionalBase& other) = default;
+ constexpr OptionalBase(OptionalBase&& other) = default;
+
+ template <class... Args>
+ constexpr explicit OptionalBase(in_place_t, Args&&... args)
+ : storage_(in_place, std::forward<Args>(args)...) {}
+
+ // Implementation of converting constructors.
+ template <typename U>
+ explicit OptionalBase(const OptionalBase<U>& other) {
+ if (other.storage_.is_populated_)
+ storage_.Init(other.storage_.value_);
+ }
+
+ template <typename U>
+ explicit OptionalBase(OptionalBase<U>&& other) {
+ if (other.storage_.is_populated_)
+ storage_.Init(std::move(other.storage_.value_));
+ }
+
+ ~OptionalBase() = default;
+
+ OptionalBase& operator=(const OptionalBase& other) {
+ CopyAssign(other);
+ return *this;
+ }
+
+ OptionalBase& operator=(OptionalBase&& other) noexcept(
+ std::is_nothrow_move_assignable<T>::value&&
+ std::is_nothrow_move_constructible<T>::value) {
+ MoveAssign(std::move(other));
+ return *this;
+ }
+
+ template <typename U>
+ void CopyAssign(const OptionalBase<U>& other) {
+ if (other.storage_.is_populated_)
+ InitOrAssign(other.storage_.value_);
+ else
+ FreeIfNeeded();  // Assigning from an empty |other| empties this.
+ }
+
+ template <typename U>
+ void MoveAssign(OptionalBase<U>&& other) {
+ if (other.storage_.is_populated_)
+ InitOrAssign(std::move(other.storage_.value_));
+ else
+ FreeIfNeeded();  // Assigning from an empty |other| empties this.
+ }
+
+ template <typename U>
+ void InitOrAssign(U&& value) {
+ if (storage_.is_populated_)
+ storage_.value_ = std::forward<U>(value);  // Assign to the live value.
+ else
+ storage_.Init(std::forward<U>(value));  // Construct a new value.
+ }
+
+ void FreeIfNeeded() {
+ if (!storage_.is_populated_)
+ return;
+ storage_.value_.~T();  // Explicitly destroy the contained value.
+ storage_.is_populated_ = false;
+ }
+
+ // For implementing conversion, allow access to other typed OptionalBase
+ // class.
+ template <typename U>
+ friend class OptionalBase;
+
+ OptionalStorage<T> storage_;
+};
+
+// The following {Copy,Move}{Constructible,Assignable} structs are helpers to
+// implement constructor/assign-operator overloading. Specifically, if T is
+// not movable but copyable, Optional<T>'s move constructor should not
+// participate in overload resolution. This inheritance trick implements that.
+template <bool is_copy_constructible>
+struct CopyConstructible {};  // No restriction.
+
+template <>
+struct CopyConstructible<false> {
+ constexpr CopyConstructible() = default;
+ constexpr CopyConstructible(const CopyConstructible&) = delete;  // Disabled.
+ constexpr CopyConstructible(CopyConstructible&&) = default;
+ CopyConstructible& operator=(const CopyConstructible&) = default;
+ CopyConstructible& operator=(CopyConstructible&&) = default;
+};
+
+template <bool is_move_constructible>
+struct MoveConstructible {};  // No restriction.
+
+template <>
+struct MoveConstructible<false> {
+ constexpr MoveConstructible() = default;
+ constexpr MoveConstructible(const MoveConstructible&) = default;
+ constexpr MoveConstructible(MoveConstructible&&) = delete;  // Disabled.
+ MoveConstructible& operator=(const MoveConstructible&) = default;
+ MoveConstructible& operator=(MoveConstructible&&) = default;
+};
+
+template <bool is_copy_assignable>
+struct CopyAssignable {};  // No restriction.
+
+template <>
+struct CopyAssignable<false> {
+ constexpr CopyAssignable() = default;
+ constexpr CopyAssignable(const CopyAssignable&) = default;
+ constexpr CopyAssignable(CopyAssignable&&) = default;
+ CopyAssignable& operator=(const CopyAssignable&) = delete;  // Disabled.
+ CopyAssignable& operator=(CopyAssignable&&) = default;
+};
+
+template <bool is_move_assignable>
+struct MoveAssignable {};  // No restriction.
+
+template <>
+struct MoveAssignable<false> {
+ constexpr MoveAssignable() = default;
+ constexpr MoveAssignable(const MoveAssignable&) = default;
+ constexpr MoveAssignable(MoveAssignable&&) = default;
+ MoveAssignable& operator=(const MoveAssignable&) = default;
+ MoveAssignable& operator=(MoveAssignable&&) = delete;  // Disabled.
+};
+
+// Helpers to conditionally enable converting constructors and assign operators.
+template <typename T, typename U>
+struct IsConvertibleFromOptional
+ : std::integral_constant<
+ bool,
+ std::is_constructible<T, Optional<U>&>::value ||
+ std::is_constructible<T, const Optional<U>&>::value ||
+ std::is_constructible<T, Optional<U>&&>::value ||
+ std::is_constructible<T, const Optional<U>&&>::value ||
+ std::is_convertible<Optional<U>&, T>::value ||
+ std::is_convertible<const Optional<U>&, T>::value ||
+ std::is_convertible<Optional<U>&&, T>::value ||
+ std::is_convertible<const Optional<U>&&, T>::value> {};
+
+template <typename T, typename U>
+struct IsAssignableFromOptional  // Also true if merely convertible (below).
+ : std::integral_constant<
+ bool,
+ IsConvertibleFromOptional<T, U>::value ||
+ std::is_assignable<T&, Optional<U>&>::value ||
+ std::is_assignable<T&, const Optional<U>&>::value ||
+ std::is_assignable<T&, Optional<U>&&>::value ||
+ std::is_assignable<T&, const Optional<U>&&>::value> {};
+
+// Forward compatibility for C++17.
+// Introduce one more nested namespace to avoid leaking "using std::swap".
+namespace swappable_impl {
+using std::swap;
+
+struct IsSwappableImpl {
+ // Tests if swap can be called. Check<T&>(0) returns true_type iff swap
+ // is available for T. Otherwise, Check's overload resolution falls back
+ // to Check(...) declared below thanks to SFINAE, so returns false_type.
+ template <typename T>
+ static auto Check(int)
+ -> decltype(swap(std::declval<T>(), std::declval<T>()), std::true_type());
+
+ template <typename T>
+ static std::false_type Check(...);
+};
+} // namespace swappable_impl
+
+template <typename T>
+struct IsSwappable : decltype(swappable_impl::IsSwappableImpl::Check<T&>(0)) {};
+
+// Forward compatibility for C++20 (std::remove_cvref_t).
+template <typename T>
+using RemoveCvRefT = std::remove_cv_t<std::remove_reference_t<T>>;
+
+} // namespace internal
+
+// On Windows, by default, empty-base class optimization does not work,
+// which means even if the base class is an empty struct, it still consumes one
+// byte for its body. __declspec(empty_bases) enables the optimization.
+// cf.
+// https://blogs.msdn.microsoft.com/vcblog/2016/03/30/optimizing-the-layout-of-empty-base-classes-in-vs2015-update-2-3/
+#ifdef OS_WIN
+#define OPTIONAL_DECLSPEC_EMPTY_BASES __declspec(empty_bases)
+#else
+#define OPTIONAL_DECLSPEC_EMPTY_BASES
+#endif
+
+// gurl_base::Optional is a Chromium version of the C++17 optional class:
+// std::optional documentation:
+// http://en.cppreference.com/w/cpp/utility/optional
+// Chromium documentation:
+// https://chromium.googlesource.com/chromium/src/+/master/docs/optional.md
+//
+// These are the differences between the specification and the implementation:
+// - Constructors do not use 'constexpr' as it is a C++14 extension.
+// - 'constexpr' might be missing in some places for reasons specified locally.
+// - No exceptions are thrown, because they are banned from Chromium.
+// Marked noexcept for only move constructor and move assign operators.
+// - All the non-members are in the 'gurl_base' namespace instead of 'std'.
+//
+// Note that T cannot have a constructor T(Optional<T>) etc. Optional<T> checks
+// T's constructor (specifically via IsConvertibleFromOptional), and the
+// check of whether T is constructible from Optional<T> is then recursive,
+// so it does not work. As of Feb 2018, std::optional C++17 implementation in
+// both clang and gcc has the same limitation. MSVC SFINAE looks to have
+// different behavior, but anyway it reports an error, too.
+template <typename T>
+class OPTIONAL_DECLSPEC_EMPTY_BASES Optional
+ : public internal::OptionalBase<T>,
+ public internal::CopyConstructible<std::is_copy_constructible<T>::value>,
+ public internal::MoveConstructible<std::is_move_constructible<T>::value>,
+ public internal::CopyAssignable<std::is_copy_constructible<T>::value &&
+ std::is_copy_assignable<T>::value>,
+ public internal::MoveAssignable<std::is_move_constructible<T>::value &&
+ std::is_move_assignable<T>::value> {
+ private:
+ // Disable some versions of T that are ill-formed.
+ // See: https://timsong-cpp.github.io/cppwp/n4659/optional#syn-1
+ static_assert(
+ !std::is_same<internal::RemoveCvRefT<T>, in_place_t>::value,
+ "instantiation of gurl_base::Optional with in_place_t is ill-formed");
+ static_assert(!std::is_same<internal::RemoveCvRefT<T>, nullopt_t>::value,
+ "instantiation of gurl_base::Optional with nullopt_t is ill-formed");
+ static_assert(
+ !std::is_reference<T>::value,
+ "instantiation of gurl_base::Optional with a reference type is ill-formed");
+ // See: https://timsong-cpp.github.io/cppwp/n4659/optional#optional-3
+ static_assert(std::is_destructible<T>::value,
+ "instantiation of gurl_base::Optional with a non-destructible type "
+ "is ill-formed");
+ // Arrays are explicitly disallowed because for arrays of known bound
+ // is_destructible is of undefined value.
+ // See: https://en.cppreference.com/w/cpp/types/is_destructible
+ static_assert(
+ !std::is_array<T>::value,
+ "instantiation of gurl_base::Optional with an array type is ill-formed");
+
+ public:
+#undef OPTIONAL_DECLSPEC_EMPTY_BASES
+ using value_type = T;
+
+ // Defer default/copy/move constructor implementation to OptionalBase.
+ constexpr Optional() = default;
+ constexpr Optional(const Optional& other) = default;
+ constexpr Optional(Optional&& other) noexcept(
+ std::is_nothrow_move_constructible<T>::value) = default;
+
+ constexpr Optional(nullopt_t) {} // NOLINT(runtime/explicit)
+
+ // Converting copy constructor. "explicit" only if
+ // std::is_convertible<const U&, T>::value is false. It is implemented by
+ // declaring two almost identical constructors whose enable_if_t conditions
+ // differ, so that exactly one is chosen, thanks to SFINAE.
+ template <
+ typename U,
+ std::enable_if_t<std::is_constructible<T, const U&>::value &&
+ !internal::IsConvertibleFromOptional<T, U>::value &&
+ std::is_convertible<const U&, T>::value,
+ bool> = false>
+ Optional(const Optional<U>& other) : internal::OptionalBase<T>(other) {}
+
+ template <
+ typename U,
+ std::enable_if_t<std::is_constructible<T, const U&>::value &&
+ !internal::IsConvertibleFromOptional<T, U>::value &&
+ !std::is_convertible<const U&, T>::value,
+ bool> = false>
+ explicit Optional(const Optional<U>& other)
+ : internal::OptionalBase<T>(other) {}
+
+ // Converting move constructor. Similar to converting copy constructor,
+ // declaring two (explicit and non-explicit) constructors.
+ template <
+ typename U,
+ std::enable_if_t<std::is_constructible<T, U&&>::value &&
+ !internal::IsConvertibleFromOptional<T, U>::value &&
+ std::is_convertible<U&&, T>::value,
+ bool> = false>
+ Optional(Optional<U>&& other) : internal::OptionalBase<T>(std::move(other)) {}
+
+ template <
+ typename U,
+ std::enable_if_t<std::is_constructible<T, U&&>::value &&
+ !internal::IsConvertibleFromOptional<T, U>::value &&
+ !std::is_convertible<U&&, T>::value,
+ bool> = false>
+ explicit Optional(Optional<U>&& other)
+ : internal::OptionalBase<T>(std::move(other)) {}
+
+ template <class... Args>
+ constexpr explicit Optional(in_place_t, Args&&... args)
+ : internal::OptionalBase<T>(in_place, std::forward<Args>(args)...) {}
+
+ template <
+ class U,
+ class... Args,
+ class = std::enable_if_t<std::is_constructible<value_type,
+ std::initializer_list<U>&,
+ Args...>::value>>
+ constexpr explicit Optional(in_place_t,
+ std::initializer_list<U> il,
+ Args&&... args)
+ : internal::OptionalBase<T>(in_place, il, std::forward<Args>(args)...) {}
+
+ // Forward value constructor. Similar to converting constructors,
+ // conditionally explicit.
+ template <
+ typename U = value_type,
+ std::enable_if_t<
+ std::is_constructible<T, U&&>::value &&
+ !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value &&
+ !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value &&
+ std::is_convertible<U&&, T>::value,
+ bool> = false>
+ constexpr Optional(U&& value)
+ : internal::OptionalBase<T>(in_place, std::forward<U>(value)) {}
+
+ template <
+ typename U = value_type,
+ std::enable_if_t<
+ std::is_constructible<T, U&&>::value &&
+ !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value &&
+ !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value &&
+ !std::is_convertible<U&&, T>::value,
+ bool> = false>
+ constexpr explicit Optional(U&& value)
+ : internal::OptionalBase<T>(in_place, std::forward<U>(value)) {}
+
+ ~Optional() = default;
+
+ // Defer copy-/move- assign operator implementation to OptionalBase.
+ Optional& operator=(const Optional& other) = default;
+ Optional& operator=(Optional&& other) noexcept(
+ std::is_nothrow_move_assignable<T>::value&&
+ std::is_nothrow_move_constructible<T>::value) = default;
+
+ Optional& operator=(nullopt_t) {
+ FreeIfNeeded();
+ return *this;
+ }
+
+ // Perfect-forwarded assignment.
+ template <typename U>
+ std::enable_if_t<
+ !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value &&
+ std::is_constructible<T, U>::value &&
+ std::is_assignable<T&, U>::value &&
+ (!std::is_scalar<T>::value ||
+ !std::is_same<std::decay_t<U>, T>::value),
+ Optional&>
+ operator=(U&& value) {
+ InitOrAssign(std::forward<U>(value));
+ return *this;
+ }
+
+ // Copy assign the state of other.
+ template <typename U>
+ std::enable_if_t<!internal::IsAssignableFromOptional<T, U>::value &&
+ std::is_constructible<T, const U&>::value &&
+ std::is_assignable<T&, const U&>::value,
+ Optional&>
+ operator=(const Optional<U>& other) {
+ CopyAssign(other);
+ return *this;
+ }
+
+ // Move assign the state of other.
+ template <typename U>
+ std::enable_if_t<!internal::IsAssignableFromOptional<T, U>::value &&
+ std::is_constructible<T, U>::value &&
+ std::is_assignable<T&, U>::value,
+ Optional&>
+ operator=(Optional<U>&& other) {
+ MoveAssign(std::move(other));
+ return *this;
+ }
+
+ constexpr const T* operator->() const {
+ GURL_CHECK(storage_.is_populated_);
+ return &storage_.value_;
+ }
+
+ constexpr T* operator->() {
+ GURL_CHECK(storage_.is_populated_);
+ return &storage_.value_;
+ }
+
+ constexpr const T& operator*() const & {
+ GURL_CHECK(storage_.is_populated_);
+ return storage_.value_;
+ }
+
+ constexpr T& operator*() & {
+ GURL_CHECK(storage_.is_populated_);
+ return storage_.value_;
+ }
+
+ constexpr const T&& operator*() const && {
+ GURL_CHECK(storage_.is_populated_);
+ return std::move(storage_.value_);
+ }
+
+ constexpr T&& operator*() && {
+ GURL_CHECK(storage_.is_populated_);
+ return std::move(storage_.value_);
+ }
+
+ constexpr explicit operator bool() const { return storage_.is_populated_; }
+
+ constexpr bool has_value() const { return storage_.is_populated_; }
+
+ constexpr T& value() & {
+ GURL_CHECK(storage_.is_populated_);
+ return storage_.value_;
+ }
+
+ constexpr const T& value() const & {
+ GURL_CHECK(storage_.is_populated_);
+ return storage_.value_;
+ }
+
+ constexpr T&& value() && {
+ GURL_CHECK(storage_.is_populated_);
+ return std::move(storage_.value_);
+ }
+
+ constexpr const T&& value() const && {
+ GURL_CHECK(storage_.is_populated_);
+ return std::move(storage_.value_);
+ }
+
+ template <class U>
+ constexpr T value_or(U&& default_value) const& {
+ // TODO(mlamouri): add the following assert when possible:
+ // static_assert(std::is_copy_constructible<T>::value,
+ // "T must be copy constructible");
+ static_assert(std::is_convertible<U, T>::value,
+ "U must be convertible to T");
+ return storage_.is_populated_
+ ? storage_.value_
+ : static_cast<T>(std::forward<U>(default_value));
+ }
+
+ template <class U>
+ constexpr T value_or(U&& default_value) && {
+ // TODO(mlamouri): add the following assert when possible:
+ // static_assert(std::is_move_constructible<T>::value,
+ // "T must be move constructible");
+ static_assert(std::is_convertible<U, T>::value,
+ "U must be convertible to T");
+ return storage_.is_populated_
+ ? std::move(storage_.value_)
+ : static_cast<T>(std::forward<U>(default_value));
+ }
+
+ void swap(Optional& other) {
+ if (!storage_.is_populated_ && !other.storage_.is_populated_)
+ return;
+
+ if (storage_.is_populated_ != other.storage_.is_populated_) {
+ if (storage_.is_populated_) {  // Only |this| holds a value: move it over.
+ other.storage_.Init(std::move(storage_.value_));
+ FreeIfNeeded();
+ } else {  // Only |other| holds a value: move it here.
+ storage_.Init(std::move(other.storage_.value_));
+ other.FreeIfNeeded();
+ }
+ return;
+ }
+
+ GURL_DCHECK(storage_.is_populated_ && other.storage_.is_populated_);
+ using std::swap;
+ swap(**this, *other);
+ }
+
+ void reset() { FreeIfNeeded(); }
+
+ template <class... Args>
+ T& emplace(Args&&... args) {
+ FreeIfNeeded();  // Destroy any existing value before constructing anew.
+ storage_.Init(std::forward<Args>(args)...);
+ return storage_.value_;
+ }
+
+ template <class U, class... Args>
+ std::enable_if_t<
+ std::is_constructible<T, std::initializer_list<U>&, Args&&...>::value,
+ T&>
+ emplace(std::initializer_list<U> il, Args&&... args) {
+ FreeIfNeeded();  // Destroy any existing value before constructing anew.
+ storage_.Init(il, std::forward<Args>(args)...);
+ return storage_.value_;
+ }
+
+ private:
+ // Accessing the template base class's protected members requires explicit
+ // using-declarations.
+ using internal::OptionalBase<T>::CopyAssign;
+ using internal::OptionalBase<T>::FreeIfNeeded;
+ using internal::OptionalBase<T>::InitOrAssign;
+ using internal::OptionalBase<T>::MoveAssign;
+ using internal::OptionalBase<T>::storage_;
+};
+
+// The following defines comparison operators. The definitions follow
+// http://en.cppreference.com/w/cpp/utility/optional/operator_cmp
+// while bool() casting is replaced by has_value() to meet the chromium
+// style guide.
+template <class T, class U>
+constexpr bool operator==(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (lhs.has_value() != rhs.has_value())  // Exactly one side is empty.
+ return false;
+ if (!lhs.has_value())  // Both sides are empty.
+ return true;
+ return *lhs == *rhs;
+}
+
+template <class T, class U>
+constexpr bool operator!=(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (lhs.has_value() != rhs.has_value())  // Exactly one side is empty.
+ return true;
+ if (!lhs.has_value())  // Both sides are empty.
+ return false;
+ return *lhs != *rhs;
+}
+
+template <class T, class U>
+constexpr bool operator<(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (!rhs.has_value())  // Nothing is less than an empty Optional.
+ return false;
+ if (!lhs.has_value())  // Empty is less than any non-empty Optional.
+ return true;
+ return *lhs < *rhs;
+}
+
+template <class T, class U>
+constexpr bool operator<=(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (!lhs.has_value())  // Empty is <= everything.
+ return true;
+ if (!rhs.has_value())  // A non-empty Optional is never <= an empty one.
+ return false;
+ return *lhs <= *rhs;
+}
+
+template <class T, class U>
+constexpr bool operator>(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (!lhs.has_value())  // Empty is never greater than anything.
+ return false;
+ if (!rhs.has_value())  // A non-empty Optional is greater than an empty one.
+ return true;
+ return *lhs > *rhs;
+}
+
+template <class T, class U>
+constexpr bool operator>=(const Optional<T>& lhs, const Optional<U>& rhs) {
+ if (!rhs.has_value())  // Everything is >= empty.
+ return true;
+ if (!lhs.has_value())  // Empty is never >= a non-empty Optional.
+ return false;
+ return *lhs >= *rhs;
+}
+
+template <class T>
+constexpr bool operator==(const Optional<T>& opt, nullopt_t) {
+  return !opt.has_value();  // Equal to nullopt only when empty.
+}
+
+template <class T>
+constexpr bool operator==(nullopt_t, const Optional<T>& opt) {
+  return !opt.has_value();  // Equal to nullopt only when empty.
+}
+
+template <class T>
+constexpr bool operator!=(const Optional<T>& opt, nullopt_t) {
+  return opt.has_value();
+}
+
+template <class T>
+constexpr bool operator!=(nullopt_t, const Optional<T>& opt) {
+  return opt.has_value();
+}
+
+template <class T>
+constexpr bool operator<(const Optional<T>& /*opt*/, nullopt_t) {
+  return false;  // Nothing is less than nullopt.
+}
+
+template <class T>
+constexpr bool operator<(nullopt_t, const Optional<T>& opt) {
+  return opt.has_value();  // nullopt is less than any non-empty Optional.
+}
+
+template <class T>
+constexpr bool operator<=(const Optional<T>& opt, nullopt_t) {
+  return !opt.has_value();  // Only an empty Optional is <= nullopt.
+}
+
+template <class T>
+constexpr bool operator<=(nullopt_t, const Optional<T>& /*opt*/) {
+  return true;  // nullopt is <= everything.
+}
+
+template <class T>
+constexpr bool operator>(const Optional<T>& opt, nullopt_t) {
+  return opt.has_value();  // Any non-empty Optional is greater than nullopt.
+}
+
+template <class T>
+constexpr bool operator>(nullopt_t, const Optional<T>& /*opt*/) {
+  return false;  // nullopt is never greater than anything.
+}
+
+template <class T>
+constexpr bool operator>=(const Optional<T>& /*opt*/, nullopt_t) {
+  return true;  // Everything is >= nullopt.
+}
+
+template <class T>
+constexpr bool operator>=(nullopt_t, const Optional<T>& opt) {
+  return !opt.has_value();  // nullopt is >= only an empty Optional.
+}
+
+template <class T, class U>
+constexpr bool operator==(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt == value : false;  // Empty never equals.
+}
+
+template <class T, class U>
+constexpr bool operator==(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value == *opt : false;  // Empty never equals.
+}
+
+template <class T, class U>
+constexpr bool operator!=(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt != value : true;  // Empty always differs.
+}
+
+template <class T, class U>
+constexpr bool operator!=(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value != *opt : true;  // Empty always differs.
+}
+
+template <class T, class U>
+constexpr bool operator<(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt < value : true;  // Empty < any value.
+}
+
+template <class T, class U>
+constexpr bool operator<(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value < *opt : false;  // No value < empty.
+}
+
+template <class T, class U>
+constexpr bool operator<=(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt <= value : true;  // Empty <= any value.
+}
+
+template <class T, class U>
+constexpr bool operator<=(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value <= *opt : false;  // No value <= empty.
+}
+
+template <class T, class U>
+constexpr bool operator>(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt > value : false;  // Empty is never greater.
+}
+
+template <class T, class U>
+constexpr bool operator>(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value > *opt : true;  // Any value > empty.
+}
+
+template <class T, class U>
+constexpr bool operator>=(const Optional<T>& opt, const U& value) {
+ return opt.has_value() ? *opt >= value : false;  // Empty is never >=.
+}
+
+template <class T, class U>
+constexpr bool operator>=(const U& value, const Optional<T>& opt) {
+ return opt.has_value() ? value >= *opt : true;  // Any value >= empty.
+}
+
+template <class T>
+constexpr Optional<std::decay_t<T>> make_optional(T&& value) {  // From a value.
+ return Optional<std::decay_t<T>>(std::forward<T>(value));
+}
+
+template <class T, class... Args>
+constexpr Optional<T> make_optional(Args&&... args) {  // In place from |args|.
+ return Optional<T>(in_place, std::forward<Args>(args)...);
+}
+
+template <class T, class U, class... Args>
+constexpr Optional<T> make_optional(std::initializer_list<U> il,
+ Args&&... args) {  // In place from |il| followed by |args|.
+ return Optional<T>(in_place, il, std::forward<Args>(args)...);
+}
+
+// Partial specialization for a function template is not allowed. Also, it is
+// not allowed to add overload function to std namespace, while it is allowed
+// to specialize the template in std. Thus, swap() (kind of) overloading is
+// defined in the gurl_base namespace, instead.
+template <class T>
+std::enable_if_t<std::is_move_constructible<T>::value &&
+ internal::IsSwappable<T>::value>
+swap(Optional<T>& lhs, Optional<T>& rhs) {
+ lhs.swap(rhs);
+}
+
+} // namespace gurl_base
+
+namespace std {
+
+template <class T>
+struct hash<gurl_base::Optional<T>> {  // An empty Optional hashes to 0.
+  size_t operator()(const gurl_base::Optional<T>& opt) const {
+    return opt.has_value() ? std::hash<std::remove_const_t<T>>()(*opt) : 0;
+  }
+};
+
+} // namespace std
+
+#endif // BASE_OPTIONAL_H_
diff --git a/base/stl_util.h b/base/stl_util.h
new file mode 100644
index 0000000..d6ca464
--- /dev/null
+++ b/base/stl_util.h
@@ -0,0 +1,657 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Derived from google3/util/gtl/stl_util.h
+
+#ifndef BASE_STL_UTIL_H_
+#define BASE_STL_UTIL_H_
+
+#include <algorithm>
+#include <deque>
+#include <forward_list>
+#include <functional>
+#include <initializer_list>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "polyfills/base/logging.h"
+#include "base/optional.h"
+#include "base/template_util.h"
+
+namespace gurl_base {
+
+namespace internal {
+
+// Walks |container| from begin() to end(). Every element for which |pred|
+// returns true is removed through container.erase(), whose return value
+// becomes the next position; all other elements are simply stepped over.
+template <typename Container, typename Predicate>
+void IterateAndEraseIf(Container& container, Predicate pred) {
+  auto cursor = container.begin();
+  while (cursor != container.end()) {
+    cursor = pred(*cursor) ? container.erase(cursor) : std::next(cursor);
+  }
+}
+
+template <typename Iter>
+constexpr bool IsRandomAccessIter =
+ std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+ std::random_access_iterator_tag>::value;
+
+// Utility type traits used for specializing gurl_base::Contains() below.
+template <typename Container, typename Element, typename = void>
+struct HasFindWithNpos : std::false_type {};
+
+template <typename Container, typename Element>
+struct HasFindWithNpos<
+ Container,
+ Element,
+ void_t<decltype(std::declval<const Container&>().find(
+ std::declval<const Element&>()) != Container::npos)>>
+ : std::true_type {};
+
+template <typename Container, typename Element, typename = void>
+struct HasFindWithEnd : std::false_type {};
+
+template <typename Container, typename Element>
+struct HasFindWithEnd<Container,
+ Element,
+ void_t<decltype(std::declval<const Container&>().find(
+ std::declval<const Element&>()) !=
+ std::declval<const Container&>().end())>>
+ : std::true_type {};
+
+template <typename Container, typename Element, typename = void>
+struct HasContains : std::false_type {};
+
+template <typename Container, typename Element>
+struct HasContains<Container,
+ Element,
+ void_t<decltype(std::declval<const Container&>().contains(
+ std::declval<const Element&>()))>> : std::true_type {};
+
+} // namespace internal
+
+// C++14 implementation of C++17's std::size():
+// http://en.cppreference.com/w/cpp/iterator/size
+template <typename Container>
+constexpr auto size(const Container& c) -> decltype(c.size()) {
+ return c.size();
+}
+
+template <typename T, size_t N>
+constexpr size_t size(const T (&array)[N]) noexcept {
+ return N;
+}
+
+// C++14 implementation of C++17's std::empty():
+// http://en.cppreference.com/w/cpp/iterator/empty
+template <typename Container>
+constexpr auto empty(const Container& c) -> decltype(c.empty()) {
+ return c.empty();
+}
+
+template <typename T, size_t N>
+constexpr bool empty(const T (&array)[N]) noexcept {
+ return false;
+}
+
+template <typename T>
+constexpr bool empty(std::initializer_list<T> il) noexcept {
+ return il.size() == 0;
+}
+
+// C++14 implementation of C++17's std::data():
+// http://en.cppreference.com/w/cpp/iterator/data
+template <typename Container>
+constexpr auto data(Container& c) -> decltype(c.data()) {
+ return c.data();
+}
+
+// std::basic_string::data() had no mutable overload prior to C++17 [1].
+// Hence this overload is provided.
+// Note: str[0] is safe even for empty strings, as they are guaranteed to be
+// null-terminated [2].
+//
+// [1] http://en.cppreference.com/w/cpp/string/basic_string/data
+// [2] http://en.cppreference.com/w/cpp/string/basic_string/operator_at
+template <typename CharT, typename Traits, typename Allocator>
+CharT* data(std::basic_string<CharT, Traits, Allocator>& str) {
+ return std::addressof(str[0]);
+}
+
+template <typename Container>
+constexpr auto data(const Container& c) -> decltype(c.data()) {
+ return c.data();
+}
+
+template <typename T, size_t N>
+constexpr T* data(T (&array)[N]) noexcept {
+ return array;
+}
+
+template <typename T>
+constexpr const T* data(std::initializer_list<T> il) noexcept {
+ return il.begin();
+}
+
+// Returns a const reference to the underlying container of a container adapter.
+// Works for std::priority_queue, std::queue, and std::stack.
+template <class A>
+const typename A::container_type& GetUnderlyingContainer(const A& adapter) {
+ struct ExposedAdapter : A {
+ using A::c;
+ };
+ return adapter.*&ExposedAdapter::c;
+}
+
+// Clears internal memory of an STL object.
+// STL clear()/reserve(0) does not always free the internal memory allocated.
+// This function uses swap/destructor to ensure the internal memory is freed.
+template<class T>
+void STLClearObject(T* obj) {
+  T scratch;
+  scratch.swap(*obj);
+  // |scratch| may itself start out owning memory (arena implementation?), in
+  // which case |obj| now holds it. Ask for reserve(0) as a best-effort extra.
+  obj->reserve(0);
+}
+
+// Counts the number of instances of val in a container.
+template <typename Container, typename T>
+typename std::iterator_traits<
+    typename Container::const_iterator>::difference_type
+STLCount(const Container& container, const T& val) {
+  return std::count(std::begin(container), std::end(container), val);
+}
+
+// General purpose implementation to check if |container| contains |value|.
+template <typename Container,
+ typename Value,
+ std::enable_if_t<
+ !internal::HasFindWithNpos<Container, Value>::value &&
+ !internal::HasFindWithEnd<Container, Value>::value &&
+ !internal::HasContains<Container, Value>::value>* = nullptr>
+bool Contains(const Container& container, const Value& value) {
+ using std::begin;
+ using std::end;
+ return std::find(begin(container), end(container), value) != end(container);
+}
+
+// Specialized Contains() implementation for when |container| has a find()
+// member function and a static npos member, but no contains() member function.
+template <typename Container,
+ typename Value,
+ std::enable_if_t<internal::HasFindWithNpos<Container, Value>::value &&
+ !internal::HasContains<Container, Value>::value>* =
+ nullptr>
+bool Contains(const Container& container, const Value& value) {
+ return container.find(value) != Container::npos;
+}
+
+// Specialized Contains() implementation for when |container| has a find()
+// and end() member function, but no contains() member function.
+template <typename Container,
+ typename Value,
+ std::enable_if_t<internal::HasFindWithEnd<Container, Value>::value &&
+ !internal::HasContains<Container, Value>::value>* =
+ nullptr>
+bool Contains(const Container& container, const Value& value) {
+ return container.find(value) != container.end();
+}
+
+// Specialized Contains() implementation for when |container| has a contains()
+// member function.
+template <
+ typename Container,
+ typename Value,
+ std::enable_if_t<internal::HasContains<Container, Value>::value>* = nullptr>
+bool Contains(const Container& container, const Value& value) {
+ return container.contains(value);
+}
+
+// O(1) implementation of const casting an iterator for any sequence,
+// associative or unordered associative container in the STL.
+//
+// Reference: https://stackoverflow.com/a/10669041
+template <typename Container,
+ typename ConstIter,
+ std::enable_if_t<!internal::IsRandomAccessIter<ConstIter>>* = nullptr>
+constexpr auto ConstCastIterator(Container& c, ConstIter it) {
+ return c.erase(it, it);
+}
+
+// Explicit overload for std::forward_list where erase() is named erase_after().
+template <typename T, typename Allocator>
+constexpr auto ConstCastIterator(
+ std::forward_list<T, Allocator>& c,
+ typename std::forward_list<T, Allocator>::const_iterator it) {
+// The erase_after(it, it) trick used below does not work for libstdc++ [1],
+// thus we need a different way.
+// TODO(crbug.com/972541): Remove this workaround once libstdc++ is fixed on all
+// platforms.
+//
+// [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90857
+#if defined(__GLIBCXX__)
+ return c.insert_after(it, {});
+#else
+ return c.erase_after(it, it);
+#endif
+}
+
+// Specialized O(1) const casting for random access iterators. This is
+// necessary, because erase() is either not available (e.g. array-like
+// containers), or has O(n) complexity (e.g. std::deque or std::vector).
+template <typename Container,
+ typename ConstIter,
+ std::enable_if_t<internal::IsRandomAccessIter<ConstIter>>* = nullptr>
+constexpr auto ConstCastIterator(Container& c, ConstIter it) {
+ using std::begin;
+ using std::cbegin;
+ return begin(c) + (it - cbegin(c));
+}
+
+namespace internal {
+
+template <typename Map, typename Key, typename Value>
+std::pair<typename Map::iterator, bool> InsertOrAssignImpl(Map& map,
+ Key&& key,
+ Value&& value) {
+ auto lower = map.lower_bound(key);
+ if (lower != map.end() && !map.key_comp()(key, lower->first)) {
+ // key already exists, perform assignment.
+ lower->second = std::forward<Value>(value);
+ return {lower, false};
+ }
+
+ // key did not yet exist, insert it.
+ return {map.emplace_hint(lower, std::forward<Key>(key),
+ std::forward<Value>(value)),
+ true};
+}
+
+template <typename Map, typename Key, typename Value>
+typename Map::iterator InsertOrAssignImpl(Map& map,
+ typename Map::const_iterator hint,
+ Key&& key,
+ Value&& value) {
+ auto&& key_comp = map.key_comp();
+ if ((hint == map.begin() || key_comp(std::prev(hint)->first, key))) {
+ if (hint == map.end() || key_comp(key, hint->first)) {
+ // *(hint - 1) < key < *hint => key did not exist and hint is correct.
+ return map.emplace_hint(hint, std::forward<Key>(key),
+ std::forward<Value>(value));
+ }
+
+ if (!key_comp(hint->first, key)) {
+ // key == *hint => key already exists and hint is correct.
+ auto mutable_hint = ConstCastIterator(map, hint);
+ mutable_hint->second = std::forward<Value>(value);
+ return mutable_hint;
+ }
+ }
+
+ // hint was not helpful, dispatch to hintless version.
+ return InsertOrAssignImpl(map, std::forward<Key>(key),
+ std::forward<Value>(value))
+ .first;
+}
+
+template <typename Map, typename Key, typename... Args>
+std::pair<typename Map::iterator, bool> TryEmplaceImpl(Map& map,
+ Key&& key,
+ Args&&... args) {
+ auto lower = map.lower_bound(key);
+ if (lower != map.end() && !map.key_comp()(key, lower->first)) {
+ // key already exists, do nothing.
+ return {lower, false};
+ }
+
+ // key did not yet exist, insert it.
+ return {map.emplace_hint(lower, std::piecewise_construct,
+ std::forward_as_tuple(std::forward<Key>(key)),
+ std::forward_as_tuple(std::forward<Args>(args)...)),
+ true};
+}
+
+template <typename Map, typename Key, typename... Args>
+typename Map::iterator TryEmplaceImpl(Map& map,
+ typename Map::const_iterator hint,
+ Key&& key,
+ Args&&... args) {
+ auto&& key_comp = map.key_comp();
+ if ((hint == map.begin() || key_comp(std::prev(hint)->first, key))) {
+ if (hint == map.end() || key_comp(key, hint->first)) {
+ // *(hint - 1) < key < *hint => key did not exist and hint is correct.
+ return map.emplace_hint(
+ hint, std::piecewise_construct,
+ std::forward_as_tuple(std::forward<Key>(key)),
+ std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+
+ if (!key_comp(hint->first, key)) {
+ // key == *hint => no-op, return correct hint.
+ return ConstCastIterator(map, hint);
+ }
+ }
+
+ // hint was not helpful, dispatch to hintless version.
+ return TryEmplaceImpl(map, std::forward<Key>(key),
+ std::forward<Args>(args)...)
+ .first;
+}
+
+} // namespace internal
+
+// Implementation of C++17's std::map::insert_or_assign as a free function.
+template <typename Map, typename Value>
+std::pair<typename Map::iterator, bool>
+InsertOrAssign(Map& map, const typename Map::key_type& key, Value&& value) {
+ return internal::InsertOrAssignImpl(map, key, std::forward<Value>(value));
+}
+
+template <typename Map, typename Value>
+std::pair<typename Map::iterator, bool>
+InsertOrAssign(Map& map, typename Map::key_type&& key, Value&& value) {
+ return internal::InsertOrAssignImpl(map, std::move(key),
+ std::forward<Value>(value));
+}
+
+// Implementation of C++17's std::map::insert_or_assign with hint as a free
+// function.
+template <typename Map, typename Value>
+typename Map::iterator InsertOrAssign(Map& map,
+ typename Map::const_iterator hint,
+ const typename Map::key_type& key,
+ Value&& value) {
+ return internal::InsertOrAssignImpl(map, hint, key,
+ std::forward<Value>(value));
+}
+
+template <typename Map, typename Value>
+typename Map::iterator InsertOrAssign(Map& map,
+ typename Map::const_iterator hint,
+ typename Map::key_type&& key,
+ Value&& value) {
+ return internal::InsertOrAssignImpl(map, hint, std::move(key),
+ std::forward<Value>(value));
+}
+
+// Implementation of C++17's std::map::try_emplace as a free function.
+template <typename Map, typename... Args>
+std::pair<typename Map::iterator, bool>
+TryEmplace(Map& map, const typename Map::key_type& key, Args&&... args) {
+ return internal::TryEmplaceImpl(map, key, std::forward<Args>(args)...);
+}
+
+template <typename Map, typename... Args>
+std::pair<typename Map::iterator, bool> TryEmplace(Map& map,
+ typename Map::key_type&& key,
+ Args&&... args) {
+ return internal::TryEmplaceImpl(map, std::move(key),
+ std::forward<Args>(args)...);
+}
+
+// Implementation of C++17's std::map::try_emplace with hint as a free
+// function.
+template <typename Map, typename... Args>
+typename Map::iterator TryEmplace(Map& map,
+ typename Map::const_iterator hint,
+ const typename Map::key_type& key,
+ Args&&... args) {
+ return internal::TryEmplaceImpl(map, hint, key, std::forward<Args>(args)...);
+}
+
+template <typename Map, typename... Args>
+typename Map::iterator TryEmplace(Map& map,
+ typename Map::const_iterator hint,
+ typename Map::key_type&& key,
+ Args&&... args) {
+ return internal::TryEmplaceImpl(map, hint, std::move(key),
+ std::forward<Args>(args)...);
+}
+
+// Returns true if the container is sorted.
+template <typename Container>
+bool STLIsSorted(const Container& cont) {
+ return std::is_sorted(std::begin(cont), std::end(cont));
+}
+
+// Returns a new ResultType containing the difference of two sorted containers.
+template <typename ResultType, typename Arg1, typename Arg2>
+ResultType STLSetDifference(const Arg1& a1, const Arg2& a2) {
+  // std::set_difference requires both inputs to be sorted.
+  GURL_DCHECK(STLIsSorted(a1));
+  GURL_DCHECK(STLIsSorted(a2));
+  ResultType only_in_a1;
+  auto sink = std::inserter(only_in_a1, only_in_a1.end());
+  std::set_difference(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
+  return only_in_a1;
+}
+
+// Returns a new ResultType containing the union of two sorted containers.
+template <typename ResultType, typename Arg1, typename Arg2>
+ResultType STLSetUnion(const Arg1& a1, const Arg2& a2) {
+  // std::set_union requires both inputs to be sorted.
+  GURL_DCHECK(STLIsSorted(a1));
+  GURL_DCHECK(STLIsSorted(a2));
+  ResultType merged;
+  auto sink = std::inserter(merged, merged.end());
+  std::set_union(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
+  return merged;
+}
+
+// Returns a new ResultType containing the intersection of two sorted
+// containers.
+template <typename ResultType, typename Arg1, typename Arg2>
+ResultType STLSetIntersection(const Arg1& a1, const Arg2& a2) {
+  // std::set_intersection requires both inputs to be sorted.
+  GURL_DCHECK(STLIsSorted(a1));
+  GURL_DCHECK(STLIsSorted(a2));
+  ResultType common;
+  auto sink = std::inserter(common, common.end());
+  std::set_intersection(a1.begin(), a1.end(), a2.begin(), a2.end(), sink);
+  return common;
+}
+
+// Returns true if the sorted container |a1| contains all elements of the sorted
+// container |a2|.
+template <typename Arg1, typename Arg2>
+bool STLIncludes(const Arg1& a1, const Arg2& a2) {
+  // std::includes requires both ranges to be sorted.
+  GURL_DCHECK(STLIsSorted(a1));
+  GURL_DCHECK(STLIsSorted(a2));
+  return std::includes(a1.begin(), a1.end(), a2.begin(), a2.end());
+}
+
+// Erase/EraseIf are based on library fundamentals ts v2 erase/erase_if
+// http://en.cppreference.com/w/cpp/experimental/lib_extensions_2
+// They provide a generic way to erase elements from a container.
+// The functions here implement these for the standard containers until those
+// functions are available in the C++ standard.
+// For Chromium containers overloads should be defined in their own headers
+// (like standard containers).
+// Note: there is no std::erase for standard associative containers so we don't
+// have it either.
+
+template <typename CharT, typename Traits, typename Allocator, typename Value>
+void Erase(std::basic_string<CharT, Traits, Allocator>& container,
+           const Value& value) {
+  auto tail = std::remove(container.begin(), container.end(), value);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <typename CharT, typename Traits, typename Allocator, class Predicate>
+void EraseIf(std::basic_string<CharT, Traits, Allocator>& container,
+             Predicate pred) {
+  auto tail = std::remove_if(container.begin(), container.end(), pred);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <class T, class Allocator, class Value>
+void Erase(std::deque<T, Allocator>& container, const Value& value) {
+  auto tail = std::remove(container.begin(), container.end(), value);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <class T, class Allocator, class Predicate>
+void EraseIf(std::deque<T, Allocator>& container, Predicate pred) {
+  auto tail = std::remove_if(container.begin(), container.end(), pred);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <class T, class Allocator, class Value>
+void Erase(std::vector<T, Allocator>& container, const Value& value) {
+  auto tail = std::remove(container.begin(), container.end(), value);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <class T, class Allocator, class Predicate>
+void EraseIf(std::vector<T, Allocator>& container, Predicate pred) {
+  auto tail = std::remove_if(container.begin(), container.end(), pred);
+  container.erase(tail, container.end());  // Trim the leftover tail.
+}
+
+template <class T, class Allocator, class Value>
+void Erase(std::forward_list<T, Allocator>& container, const Value& value) {
+  // std::forward_list::remove would first convert |value| to the container's
+  // value type. Going through remove_if keeps the comparison heterogeneous:
+  // operator== is invoked with the original Value type.
+  container.remove_if([&value](const T& element) { return element == value; });
+}
+
+template <class T, class Allocator, class Predicate>
+void EraseIf(std::forward_list<T, Allocator>& container, Predicate pred) {
+ container.remove_if(pred);
+}
+
+template <class T, class Allocator, class Value>
+void Erase(std::list<T, Allocator>& container, const Value& value) {
+  // std::list::remove would first convert |value| to the container's value
+  // type. Going through remove_if keeps the comparison heterogeneous:
+  // operator== is invoked with the original Value type.
+  container.remove_if([&value](const T& element) { return element == value; });
+}
+
+template <class T, class Allocator, class Predicate>
+void EraseIf(std::list<T, Allocator>& container, Predicate pred) {
+ container.remove_if(pred);
+}
+
+template <class Key, class T, class Compare, class Allocator, class Predicate>
+void EraseIf(std::map<Key, T, Compare, Allocator>& container, Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key, class T, class Compare, class Allocator, class Predicate>
+void EraseIf(std::multimap<Key, T, Compare, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key, class Compare, class Allocator, class Predicate>
+void EraseIf(std::set<Key, Compare, Allocator>& container, Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key, class Compare, class Allocator, class Predicate>
+void EraseIf(std::multiset<Key, Compare, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key,
+ class T,
+ class Hash,
+ class KeyEqual,
+ class Allocator,
+ class Predicate>
+void EraseIf(std::unordered_map<Key, T, Hash, KeyEqual, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key,
+ class T,
+ class Hash,
+ class KeyEqual,
+ class Allocator,
+ class Predicate>
+void EraseIf(
+ std::unordered_multimap<Key, T, Hash, KeyEqual, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key,
+ class Hash,
+ class KeyEqual,
+ class Allocator,
+ class Predicate>
+void EraseIf(std::unordered_set<Key, Hash, KeyEqual, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+template <class Key,
+ class Hash,
+ class KeyEqual,
+ class Allocator,
+ class Predicate>
+void EraseIf(std::unordered_multiset<Key, Hash, KeyEqual, Allocator>& container,
+ Predicate pred) {
+ internal::IterateAndEraseIf(container, pred);
+}
+
+// A helper class to be used as the predicate with |EraseIf| to implement
+// in-place set intersection. Helps implement the algorithm of going through
+// each container an element at a time, erasing elements from the first
+// container if they aren't in the second container. Requires each container be
+// sorted. Note that the logic below appears inverted since it is returning
+// whether an element should be erased.
+template <class Collection>
+class IsNotIn {
+ public:
+  explicit IsNotIn(const Collection& collection)
+      : i_(collection.begin()), end_(collection.end()) {}
+
+  // Returns true when |x| has no counterpart in the collection (erase it).
+  bool operator()(const typename Collection::value_type& x) {
+    // Skip every element that sorts strictly before |x|.
+    for (; i_ != end_ && *i_ < x; ++i_) {
+    }
+    const bool present = i_ != end_ && *i_ == x;
+    // Consume the match so that a repeated |x| needs its own counterpart.
+    if (present)
+      ++i_;
+    return !present;
+  }
+
+ private:
+  typename Collection::const_iterator i_;
+  const typename Collection::const_iterator end_;
+};
+
+// Helper for returning the optional value's address, or nullptr.
+template <class T>
+T* OptionalOrNullptr(gurl_base::Optional<T>& optional) {
+ return optional.has_value() ? &optional.value() : nullptr;
+}
+
+template <class T>
+const T* OptionalOrNullptr(const gurl_base::Optional<T>& optional) {
+ return optional.has_value() ? &optional.value() : nullptr;
+}
+
+} // namespace gurl_base
+
+#endif // BASE_STL_UTIL_H_
diff --git a/base/strings/BUILD b/base/strings/BUILD
new file mode 100644
index 0000000..60aebcf
--- /dev/null
+++ b/base/strings/BUILD
@@ -0,0 +1,32 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "strings",
+ srcs = [
+ "string16.cc",
+ "string_piece.cc",
+ "string_util.cc",
+ "string_util_constants.cc",
+ "utf_string_conversion_utils.cc",
+ "utf_string_conversions.cc",
+ ],
+ hdrs = [
+ "char_traits.h",
+ "string16.h",
+ "string_piece.h",
+ "string_piece_forward.h",
+ "string_util.h",
+ "string_util_posix.h",
+ "utf_string_conversion_utils.h",
+ "utf_string_conversions.h",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//base",
+ "//base/third_party/icu",
+ "//build:build_config",
+ "//polyfills",
+ ],
+)
diff --git a/base/strings/char_traits.h b/base/strings/char_traits.h
new file mode 100644
index 0000000..0fe9f26
--- /dev/null
+++ b/base/strings/char_traits.h
@@ -0,0 +1,92 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_CHAR_TRAITS_H_
+#define BASE_STRINGS_CHAR_TRAITS_H_
+
+#include <stddef.h>
+
+#include "base/compiler_specific.h"
+
+namespace gurl_base {
+
+// constexpr version of http://en.cppreference.com/w/cpp/string/char_traits.
+// This currently just implements the bits needed to support a (mostly)
+// constexpr StringPiece.
+//
+// TODO(dcheng): Once we switch to C++17, most methods will become constexpr and
+// we can switch over to using the one in the standard library.
+template <typename T>
+struct CharTraits {
+  // Performs a lexicographical comparison of the first |n| characters of |s1|
+  // and |s2|. Returns 0 if equal, -1 if |s1| is less than |s2|, and 1 if |s1|
+  // is greater than |s2|.
+ static constexpr int compare(const T* s1, const T* s2, size_t n) noexcept;
+
+ // Returns the length of |s|, assuming null termination (and not including the
+ // terminating null).
+ static constexpr size_t length(const T* s) noexcept;
+};
+
+template <typename T>
+constexpr int CharTraits<T>::compare(const T* s1,
+                                     const T* s2,
+                                     size_t n) noexcept {
+  for (size_t i = 0; i < n; ++i) {
+    if (s1[i] < s2[i])
+      return -1;
+    if (s1[i] > s2[i])
+      return 1;
+  }
+  return 0;  // The first |n| characters matched.
+}
+
+template <typename T>
+constexpr size_t CharTraits<T>::length(const T* s) noexcept {
+  size_t count = 0;
+  while (s[count])
+    ++count;
+  return count;  // Index of the terminating null.
+}
+
+// char specialization of CharTraits that can use clang's constexpr intrinsics,
+// where available.
+template <>
+struct CharTraits<char> {
+ static constexpr int compare(const char* s1,
+ const char* s2,
+ size_t n) noexcept;
+ static constexpr size_t length(const char* s) noexcept;
+};
+
+constexpr int CharTraits<char>::compare(const char* s1,
+                                        const char* s2,
+                                        size_t n) noexcept {
+#if HAS_FEATURE(cxx_constexpr_string_builtins)
+  return __builtin_memcmp(s1, s2, n);
+#else  // Fallback compares as unsigned char, matching memcmp() ordering.
+  for (; n; --n, ++s1, ++s2) {
+    const unsigned char lhs = static_cast<unsigned char>(*s1);
+    const unsigned char rhs = static_cast<unsigned char>(*s2);
+    if (lhs != rhs)
+      return lhs < rhs ? -1 : 1;
+  }
+  return 0;
+#endif
+}
+
+constexpr size_t CharTraits<char>::length(const char* s) noexcept {
+#if defined(__clang__)
+  return __builtin_strlen(s);
+#else
+  size_t count = 0;
+  while (s[count])
+    ++count;
+  return count;
+#endif
+}
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_CHAR_TRAITS_H_
diff --git a/base/strings/char_traits_unittest.cc b/base/strings/char_traits_unittest.cc
new file mode 100644
index 0000000..d0fdc07
--- /dev/null
+++ b/base/strings/char_traits_unittest.cc
@@ -0,0 +1,32 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/char_traits.h"
+#include "base/strings/string16.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+TEST(CharTraitsTest, CharCompare) {
+ static_assert(CharTraits<char>::compare("abc", "def", 3) == -1, "");
+ static_assert(CharTraits<char>::compare("def", "def", 3) == 0, "");
+ static_assert(CharTraits<char>::compare("ghi", "def", 3) == 1, "");
+}
+
+TEST(CharTraitsTest, CharLength) {
+ static_assert(CharTraits<char>::length("") == 0, "");
+ static_assert(CharTraits<char>::length("abc") == 3, "");
+}
+
+TEST(CharTraitsTest, Char16TCompare) {
+ static_assert(CharTraits<char16_t>::compare(u"abc", u"def", 3) == -1, "");
+ static_assert(CharTraits<char16_t>::compare(u"def", u"def", 3) == 0, "");
+ static_assert(CharTraits<char16_t>::compare(u"ghi", u"def", 3) == 1, "");
+}
+
+TEST(CharTraitsTest, Char16TLength) {
+ static_assert(CharTraits<char16_t>::length(u"abc") == 3, "");
+}
+
+} // namespace gurl_base
diff --git a/base/strings/latin1_string_conversions.cc b/base/strings/latin1_string_conversions.cc
new file mode 100644
index 0000000..5569015
--- /dev/null
+++ b/base/strings/latin1_string_conversions.cc
@@ -0,0 +1,19 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/latin1_string_conversions.h"
+
+namespace gurl_base {
+
+string16 Latin1OrUTF16ToUTF16(size_t length,
+                              const Latin1Char* latin1,
+                              const char16* utf16) {
+  if (length == 0)
+    return string16();
+  // A non-null |latin1| takes precedence; otherwise |utf16| is the source.
+  return latin1 ? string16(latin1, latin1 + length)
+                : string16(utf16, utf16 + length);
+}
+
+} // namespace gurl_base
diff --git a/base/strings/latin1_string_conversions.h b/base/strings/latin1_string_conversions.h
new file mode 100644
index 0000000..3d60980
--- /dev/null
+++ b/base/strings/latin1_string_conversions.h
@@ -0,0 +1,34 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_
+
+#include <stddef.h>
+
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+
+namespace gurl_base {
+
+// This definition of Latin1Char matches the definition of LChar in Blink. We
+// use unsigned char rather than char to make it less tempting to mix and match
+// Latin-1 and UTF-8 characters.
+typedef unsigned char Latin1Char;
+
+// This somewhat odd function is designed to help us convert from Blink Strings
+// to string16. A Blink string is either backed by an array of Latin-1
+// characters or an array of UTF-16 characters. This function is called by
+// WebString::operator string16() to convert one or the other character array
+// to string16. This function is defined here rather than in WebString.h to
+// avoid binary bloat in all the callers of the conversion operator.
+BASE_EXPORT string16 Latin1OrUTF16ToUTF16(size_t length,
+ const Latin1Char* latin1,
+ const char16* utf16);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_LATIN1_STRING_CONVERSIONS_H_
diff --git a/base/strings/nullable_string16.cc b/base/strings/nullable_string16.cc
new file mode 100644
index 0000000..618800d
--- /dev/null
+++ b/base/strings/nullable_string16.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/nullable_string16.h"
+
+#include <ostream>
+#include <utility>
+
+namespace gurl_base {
+NullableString16::NullableString16() = default;
+NullableString16::NullableString16(const NullableString16& other) = default;
+NullableString16::NullableString16(NullableString16&& other) = default;
+
+NullableString16::NullableString16(const string16& string, bool is_null) {
+ if (!is_null)
+ string_.emplace(string);
+}
+
+NullableString16::NullableString16(Optional<string16> optional_string16)
+ : string_(std::move(optional_string16)) {}
+
+NullableString16::~NullableString16() = default;
+NullableString16& NullableString16::operator=(const NullableString16& other) =
+ default;
+NullableString16& NullableString16::operator=(NullableString16&& other) =
+ default;
+
+std::ostream& operator<<(std::ostream& out, const NullableString16& value) {
+ return value.is_null() ? out << "(null)" : out << value.string();
+}
+
+} // namespace gurl_base
diff --git a/base/strings/nullable_string16.h b/base/strings/nullable_string16.h
new file mode 100644
index 0000000..f2ca7bd
--- /dev/null
+++ b/base/strings/nullable_string16.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_NULLABLE_STRING16_H_
+#define BASE_STRINGS_NULLABLE_STRING16_H_
+
+#include <iosfwd>
+
+#include "polyfills/base/base_export.h"
+#include "base/optional.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_util.h"
+
+namespace gurl_base {
+
+// This class is a simple wrapper for string16 which also contains a null
+// state. This should be used only where the difference between null and
+// empty is meaningful.
+class BASE_EXPORT NullableString16 {
+ public:
+ NullableString16();
+ NullableString16(const NullableString16& other);
+ NullableString16(NullableString16&& other);
+ NullableString16(const string16& string, bool is_null);
+ explicit NullableString16(Optional<string16> optional_string16);
+ ~NullableString16();
+
+ NullableString16& operator=(const NullableString16& other);
+ NullableString16& operator=(NullableString16&& other);
+
+ const string16& string() const {
+ return string_ ? *string_ : EmptyString16();
+ }
+ bool is_null() const { return !string_; }
+ const Optional<string16>& as_optional_string16() const { return string_; }
+
+ private:
+ Optional<string16> string_;
+};
+
+inline bool operator==(const NullableString16& a, const NullableString16& b) {
+ return a.as_optional_string16() == b.as_optional_string16();
+}
+
+inline bool operator!=(const NullableString16& a, const NullableString16& b) {
+ return !(a == b);
+}
+
+BASE_EXPORT std::ostream& operator<<(std::ostream& out,
+ const NullableString16& value);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_NULLABLE_STRING16_H_
diff --git a/base/strings/nullable_string16_unittest.cc b/base/strings/nullable_string16_unittest.cc
new file mode 100644
index 0000000..e3d063f
--- /dev/null
+++ b/base/strings/nullable_string16_unittest.cc
@@ -0,0 +1,35 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/nullable_string16.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+TEST(NullableString16Test, DefaultConstructor) {  // A default-constructed value is null and reads back as an empty string.
+ NullableString16 s;
+ EXPECT_TRUE(s.is_null());
+ EXPECT_EQ(string16(), s.string());
+}
+
+TEST(NullableString16Test, Equals) {  // Two non-null values with the same text compare equal.
+ NullableString16 a(ASCIIToUTF16("hello"), false);
+ NullableString16 b(ASCIIToUTF16("hello"), false);
+ EXPECT_EQ(a, b);
+}
+
+TEST(NullableString16Test, NotEquals) {  // Two non-null values with different text compare unequal.
+ NullableString16 a(ASCIIToUTF16("hello"), false);
+ NullableString16 b(ASCIIToUTF16("world"), false);
+ EXPECT_NE(a, b);
+}
+
+TEST(NullableString16Test, NotEqualsNull) {  // A non-null value compares unequal to a default-constructed one.
+ NullableString16 a(ASCIIToUTF16("hello"), false);
+ NullableString16 b;
+ EXPECT_NE(a, b);
+}
+
+} // namespace gurl_base
diff --git a/base/strings/pattern.cc b/base/strings/pattern.cc
new file mode 100644
index 0000000..65ec075
--- /dev/null
+++ b/base/strings/pattern.cc
@@ -0,0 +1,155 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/pattern.h"
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace gurl_base {
+
+namespace {
+
+constexpr bool IsWildcard(base_icu::UChar32 character) {  // True for the two wildcard characters, '*' and '?'.
+ return character == '*' || character == '?';
+}
+
+// Searches for the next subpattern of |pattern| in |string|, up to the given
+// |maximum_distance|. The subpattern extends from the start of |pattern| up to
+// the first unescaped wildcard (or the end of the pattern). If the value of
+// |maximum_distance| is negative, the maximum distance is considered infinite.
+template <typename CHAR, typename NEXT>
+constexpr bool SearchForChars(const CHAR** pattern,
+ const CHAR* pattern_end,
+ const CHAR** string,
+ const CHAR* string_end,
+ int maximum_distance,
+ NEXT next) {
+ const CHAR* pattern_start = *pattern;
+ const CHAR* string_start = *string;
+ bool escape = false;  // True while the previous pattern character was an unescaped backslash.
+ while (true) {
+ if (*pattern == pattern_end) {
+ // If this is the end of the pattern, only accept the end of the string;
+ // anything else falls through to the mismatch case.
+ if (*string == string_end)
+ return true;
+ } else {
+ // If we have found an unescaped wildcard, we're done.
+ if (!escape && IsWildcard(**pattern))
+ return true;
+
+ // Check if the escape character is found. If so, skip it and move to the
+ // next character.
+ if (!escape && **pattern == '\\') {
+ escape = true;
+ next(pattern, pattern_end);
+ continue;
+ }
+
+ escape = false;
+
+ if (*string == string_end)
+ return false;
+
+ // Check if the chars match, if so, increment the ptrs.
+ const CHAR* pattern_next = *pattern;
+ const CHAR* string_next = *string;
+ base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
+ if (pattern_char == next(&string_next, string_end) &&
+ pattern_char != CBU_SENTINEL) {  // A CBU_SENTINEL pattern character never matches.
+ *pattern = pattern_next;
+ *string = string_next;
+ continue;
+ }
+ }
+
+ // Mismatch. If we have reached the maximum distance, return false,
+ // otherwise restart at the beginning of the pattern with the next character
+ // in the string.
+ // TODO(bauerb): This is a naive implementation of substring search, which
+ // could be implemented with a more efficient algorithm, e.g.
+ // Knuth-Morris-Pratt (at the expense of requiring preprocessing).
+ if (maximum_distance == 0)
+ return false;
+
+ // Because unlimited distance is represented as -1, this will never reach 0
+ // and therefore fail the match above.
+ maximum_distance--;
+ *pattern = pattern_start;
+ next(&string_start, string_end);
+ *string = string_start;
+ }
+}
+
+// Consumes consecutive wildcard characters (? or *). Returns the maximum number
+// of characters matched by the sequence of wildcards, or -1 if the wildcards
+// match an arbitrary number of characters (which is the case if it contains at
+// least one *).
+template <typename CHAR, typename NEXT>
+constexpr int EatWildcards(const CHAR** pattern, const CHAR* end, NEXT next) {
+ int num_question_marks = 0;
+ bool has_asterisk = false;
+ while (*pattern != end) {
+ if (**pattern == '?') {
+ num_question_marks++;
+ } else if (**pattern == '*') {
+ has_asterisk = true;
+ } else {
+ break;  // First non-wildcard character; |*pattern| is left pointing at it.
+ }
+
+ next(pattern, end);
+ }
+ return has_asterisk ? -1 : num_question_marks;  // Any '*' makes the run unbounded.
+}
+
+template <typename CHAR, typename NEXT>
+constexpr bool MatchPatternT(const CHAR* eval,
+ const CHAR* eval_end,
+ const CHAR* pattern,
+ const CHAR* pattern_end,
+ NEXT next) {
+ do {  // Each pass consumes one run of wildcards and the literal subpattern that follows it.
+ int maximum_wildcard_length = EatWildcards(&pattern, pattern_end, next);
+ if (!SearchForChars(&pattern, pattern_end, &eval, eval_end,
+ maximum_wildcard_length, next)) {
+ return false;
+ }
+ } while (pattern != pattern_end);
+ return true;
+}
+
+struct NextCharUTF8 {  // Reads the next character with CBU8_NEXT, advances |*p| by the consumed offset and returns it.
+ base_icu::UChar32 operator()(const char** p, const char* end) {
+ base_icu::UChar32 c;
+ int offset = 0;
+ CBU8_NEXT(*p, offset, end - *p, c);
+ *p += offset;
+ return c;
+ }
+};
+
+struct NextCharUTF16 {  // Reads the next character with CBU16_NEXT, advances |*p| by the consumed offset and returns it.
+ base_icu::UChar32 operator()(const char16** p, const char16* end) {
+ base_icu::UChar32 c;
+ int offset = 0;
+ CBU16_NEXT(*p, offset, end - *p, c);
+ *p += offset;
+ return c;
+ }
+};
+
+} // namespace
+
+bool MatchPattern(StringPiece eval, StringPiece pattern) {  // 8-bit overload; characters are stepped with NextCharUTF8.
+ return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(),
+ pattern.data() + pattern.size(), NextCharUTF8());
+}
+
+bool MatchPattern(StringPiece16 eval, StringPiece16 pattern) {  // 16-bit overload; characters are stepped with NextCharUTF16.
+ return MatchPatternT(eval.data(), eval.data() + eval.size(), pattern.data(),
+ pattern.data() + pattern.size(), NextCharUTF16());
+}
+
+} // namespace gurl_base
diff --git a/base/strings/pattern.h b/base/strings/pattern.h
new file mode 100644
index 0000000..3d280d0
--- /dev/null
+++ b/base/strings/pattern.h
@@ -0,0 +1,23 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_PATTERN_H_
+#define BASE_STRINGS_PATTERN_H_
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// Returns true if the |string| passed in matches the |pattern|. The pattern
+// string can contain wildcards like * and ?.
+//
+// The backslash character (\) is an escape character for * and ?.
+// ? matches 0 or 1 character, while * matches 0 or more characters.
+BASE_EXPORT bool MatchPattern(StringPiece string, StringPiece pattern);
+BASE_EXPORT bool MatchPattern(StringPiece16 string, StringPiece16 pattern);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_PATTERN_H_
diff --git a/base/strings/pattern_unittest.cc b/base/strings/pattern_unittest.cc
new file mode 100644
index 0000000..540f784
--- /dev/null
+++ b/base/strings/pattern_unittest.cc
@@ -0,0 +1,52 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/pattern.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+TEST(StringUtilTest, MatchPatternTest) {
+ EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
+ EXPECT_TRUE(MatchPattern("www.google.com", "*"));
+ EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
+ EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
+ EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
+ EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));  // Matching is case-sensitive.
+ EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));  // An escaped '*' matches a literal asterisk.
+ EXPECT_FALSE(MatchPattern("", "*.*"));
+ EXPECT_TRUE(MatchPattern("", "*"));
+ EXPECT_TRUE(MatchPattern("", "?"));  // '?' may match zero characters.
+ EXPECT_TRUE(MatchPattern("", ""));
+ EXPECT_FALSE(MatchPattern("Hello", ""));
+ EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
+ EXPECT_TRUE(MatchPattern("abcd", "*???"));
+ EXPECT_FALSE(MatchPattern("abcd", "???"));  // Each '?' matches at most one character.
+ EXPECT_TRUE(MatchPattern("abcb", "a*b"));
+ EXPECT_FALSE(MatchPattern("abcb", "a?b"));
+
+ // Test UTF8 matching.
+ EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
+ EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
+ EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
+ // Invalid sequences should be handled as a single invalid character.
+ EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
+ // If the pattern has invalid characters, it shouldn't match anything.
+ EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));
+
+ // Test UTF16 character matching.
+ EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
+ UTF8ToUTF16("*.com")));
+ EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
+ UTF8ToUTF16("He??o\\*1*")));
+
+ // Some test cases that might cause naive implementations to exhibit
+ // exponential run time or fail.
+ EXPECT_TRUE(MatchPattern("Hello", "He********************************o"));
+ EXPECT_TRUE(MatchPattern("123456789012345678", "?????????????????*"));
+ EXPECT_TRUE(MatchPattern("aaaaaaaaaaab", "a*a*a*a*a*a*a*a*a*a*a*b"));
+}
+
+} // namespace gurl_base
diff --git a/base/strings/safe_sprintf.cc b/base/strings/safe_sprintf.cc
new file mode 100644
index 0000000..ab6e112
--- /dev/null
+++ b/base/strings/safe_sprintf.cc
@@ -0,0 +1,682 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/safe_sprintf.h"
+
+#include <errno.h>
+#include <string.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "base/macros.h"
+#include "build/build_config.h"
+
+#if !defined(NDEBUG)
+// In debug builds, we use RAW_CHECK() to print useful error messages, if
+// SafeSPrintf() is called with broken arguments.
+// As our contract promises that SafeSPrintf() can be called from any
+// restricted run-time context, it is not actually safe to call logging
+// functions from it; and we only ever do so for debug builds and hope for the
+// best. We should _never_ call any logging function other than RAW_CHECK(),
+// and we should _never_ include any logging code that is active in production
+// builds. Most notably, we should not include these logging functions in
+// unofficial release builds, even though those builds would otherwise have
+// DCHECKS() enabled.
+// In other words; please do not remove the #ifdef around this #include.
+// Instead, in production builds we opt for returning a degraded result,
+// whenever an error is encountered.
+// E.g. The broken function call
+// SafeSPrintf("errno = %d (%x)", errno, strerror(errno))
+// will print something like
+// errno = 13 (%x)
+// instead of
+// errno = 13 (Access denied)
+// In most of the anticipated use cases, that's probably the preferred
+// behavior.
+#include "polyfills/base/logging.h"
+#define DEBUG_CHECK RAW_CHECK
+#else
+#define DEBUG_CHECK(x) do { if (x) { } } while (0)
+#endif
+
+namespace gurl_base {
+namespace strings {
+
+// The code in this file is extremely careful to be async-signal-safe.
+//
+// Most obviously, we avoid calling any code that could dynamically allocate
+// memory. Doing so would almost certainly result in bugs and dead-locks.
+// We also avoid calling any other STL functions that could have unintended
+// side-effects involving memory allocation or access to other shared
+// resources.
+//
+// But on top of that, we also avoid calling other library functions, as many
+// of them have the side-effect of calling getenv() (in order to deal with
+// localization) or accessing errno. The latter sounds benign, but there are
+// several execution contexts where it isn't even possible to safely read let
+// alone write errno.
+//
+// The stated design goal of the SafeSPrintf() function is that it can be
+// called from any context that can safely call C or C++ code (i.e. anything
+// that doesn't require assembly code).
+//
+// For a brief overview of some but not all of the issues with async-signal-
+// safety, refer to:
+// http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html
+
+namespace {
+const size_t kSSizeMaxConst = ((size_t)(ssize_t)-1) >> 1;  // All bits set, shifted right once; static_asserted below to equal the max ssize_t.
+
+const char kUpCaseHexDigits[] = "0123456789ABCDEF";  // Indexed by digit value when |upcase| is set.
+const char kDownCaseHexDigits[] = "0123456789abcdef";  // Indexed by digit value otherwise.
+}  // namespace
+
+#if defined(NDEBUG)
+// We would like to define kSSizeMax as std::numeric_limits<ssize_t>::max(),
+// but C++ doesn't allow us to do that for constants. Instead, we have to
+// use careful casting and shifting. We later use a static_assert to
+// verify that this worked correctly.
+namespace {
+const size_t kSSizeMax = kSSizeMaxConst;
+}  // namespace
+#else // defined(NDEBUG)
+// For efficiency, we really need kSSizeMax to be a constant. But for unit
+// tests, it should be adjustable. This allows us to verify edge cases without
+// having to fill the entire available address space. As a compromise, we make
+// kSSizeMax adjustable in debug builds, and then only compile that particular
+// part of the unit test in debug builds.
+namespace {
+static size_t kSSizeMax = kSSizeMaxConst;
+}  // namespace
+
+namespace internal {
+void SetSafeSPrintfSSizeMaxForTest(size_t max) {
+ kSSizeMax = max;
+}
+
+size_t GetSafeSPrintfSSizeMaxForTest() {
+ return kSSizeMax;
+}
+}  // namespace internal
+#endif // defined(NDEBUG)
+
+namespace {
+class Buffer {
+ public:
+ // |buffer| is caller-allocated storage that SafeSPrintf() writes to. It
+ // has |size| bytes of writable storage. It is the caller's responsibility
+ // to ensure that the buffer is at least one byte in size, so that it fits
+ // the trailing NUL that will be added by the destructor. The buffer also
+ // must be smaller or equal to kSSizeMax in size.
+ Buffer(char* buffer, size_t size)
+ : buffer_(buffer),
+ size_(size - 1), // Account for trailing NUL byte
+ count_(0) {
+// MSVS2013's standard library doesn't mark max() as constexpr yet. cl.exe
+// supports static_cast but doesn't really implement constexpr yet so it doesn't
+// complain, but clang does.
+#if __cplusplus >= 201103 && !(defined(__clang__) && defined(OS_WIN))
+ static_assert(kSSizeMaxConst ==
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()),
+ "kSSizeMaxConst should be the max value of an ssize_t");
+#endif
+ DEBUG_CHECK(size > 0);
+ DEBUG_CHECK(size <= kSSizeMax);
+ }
+
+ ~Buffer() {
+ // The code calling the constructor guaranteed that there was enough space
+ // to store a trailing NUL -- and in debug builds, we are actually
+ // verifying this with DEBUG_CHECK()s in the constructor. So, we can
+ // always unconditionally write the NUL byte in the destructor. We do not
+ // need to adjust the count_, as SafeSPrintf() copies snprintf() in not
+ // including the NUL byte in its return code.
+ *GetInsertionPoint() = '\000';
+ }
+
+ // Returns true, iff the buffer is filled all the way to |kSSizeMax-1|. The
+ // caller can now stop adding more data, as GetCount() has reached its
+ // maximum possible value.
+ inline bool OutOfAddressableSpace() const {
+ return count_ == static_cast<size_t>(kSSizeMax - 1);
+ }
+
+ // Returns the number of bytes that would have been emitted to |buffer_|
+ // if it was sized sufficiently large. This number can be larger than
+ // |size_|, if the caller provided an insufficiently large output buffer.
+ // But it will never be bigger than |kSSizeMax-1|.
+ inline ssize_t GetCount() const {
+ DEBUG_CHECK(count_ < kSSizeMax);
+ return static_cast<ssize_t>(count_);
+ }
+
+ // Emits one |ch| character into the |buffer_| and updates the |count_| of
+ // characters that are currently supposed to be in the buffer.
+ // Returns "false", iff the buffer was already full.
+ // N.B. |count_| increases even if no characters have been written. This is
+ // needed so that GetCount() can return the number of bytes that should
+ // have been allocated for the |buffer_|.
+ inline bool Out(char ch) {
+ if (size_ >= 1 && count_ < size_) {
+ buffer_[count_] = ch;
+ return IncrementCountByOne();
+ }
+ // |count_| still needs to be updated, even if the buffer has been
+ // filled completely. This allows SafeSPrintf() to return the number of
+ // bytes that should have been emitted.
+ IncrementCountByOne();
+ return false;
+ }
+
+ // Inserts |padding|-|len| bytes worth of padding into the |buffer_|.
+ // |count_| will also be incremented by the number of bytes that were meant
+ // to be emitted. The |pad| character is typically either a ' ' space
+ // or a '0' zero, but other non-NUL values are legal.
+ // Returns "false", iff the |buffer_| filled up (i.e. |count_|
+ // overflowed |size_|) at any time during padding.
+ inline bool Pad(char pad, size_t padding, size_t len) {
+ DEBUG_CHECK(pad);
+ DEBUG_CHECK(padding <= kSSizeMax);
+ for (; padding > len; --padding) {
+ if (!Out(pad)) {
+ if (--padding) {  // The failed Out() already counted one byte; count the rest arithmetically.
+ IncrementCount(padding-len);
+ }
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // POSIX doesn't define any async-signal-safe function for converting
+ // an integer to ASCII. Define our own version.
+ //
+ // This also gives us the ability to make the function a little more
+ // powerful and have it deal with |padding|, with truncation, and with
+ // predicting the length of the untruncated output.
+ //
+ // IToASCII() converts an integer |i| to ASCII.
+ //
+ // Unlike similar functions in the standard C library, it never appends a
+ // NUL character. This is left for the caller to do.
+ //
+ // While the function signature takes a signed int64_t, the code decides at
+ // run-time whether to treat the argument as signed (int64_t) or as unsigned
+ // (uint64_t) based on the value of |sign|.
+ //
+ // It supports |base|s 2 through 16. Only a |base| of 10 is allowed to have
+ // a |sign|. Otherwise, |i| is treated as unsigned.
+ //
+ // For bases larger than 10, |upcase| decides whether lower-case or upper-
+ // case letters should be used to designate digits greater than 9.
+ //
+ // Padding can be done with either '0' zeros or ' ' spaces. Padding has to
+ // be positive and will always be applied to the left of the output.
+ //
+ // Prepends a |prefix| to the number (e.g. "0x"). This prefix goes to
+ // the left of |padding|, if |pad| is '0'; and to the right of |padding|
+ // if |pad| is ' '.
+ //
+ // Returns "false", if the |buffer_| overflowed at any time.
+ bool IToASCII(bool sign, bool upcase, int64_t i, int base,
+ char pad, size_t padding, const char* prefix);
+
+ private:
+ // Increments |count_| by |inc| unless this would cause |count_| to
+ // overflow |kSSizeMax-1|. Returns "false", iff an overflow was detected;
+ // it then clamps |count_| to |kSSizeMax-1|.
+ inline bool IncrementCount(size_t inc) {
+ // "inc" is either 1 or a "padding" value. Padding is clamped at
+ // run-time to at most kSSizeMax-1. So, we know that "inc" is always in
+ // the range 1..kSSizeMax-1.
+ // This allows us to compute "kSSizeMax - 1 - inc" without incurring any
+ // integer overflows.
+ DEBUG_CHECK(inc <= kSSizeMax - 1);
+ if (count_ > kSSizeMax - 1 - inc) {
+ count_ = kSSizeMax - 1;
+ return false;
+ }
+ count_ += inc;
+ return true;
+ }
+
+ // Convenience method for the common case of incrementing |count_| by one.
+ inline bool IncrementCountByOne() {
+ return IncrementCount(1);
+ }
+
+ // Return the current insertion point into the buffer. This is typically
+ // at |buffer_| + |count_|, but could be before that if truncation
+ // happened. It always points to one byte past the last byte that was
+ // successfully placed into the |buffer_|.
+ inline char* GetInsertionPoint() const {
+ size_t idx = count_;
+ if (idx > size_) {
+ idx = size_;
+ }
+ return buffer_ + idx;
+ }
+
+ // User-provided buffer that will receive the fully formatted output string.
+ char* buffer_;
+
+ // Number of bytes that are available in the buffer excluding the trailing
+ // NUL byte that will be added by the destructor.
+ const size_t size_;
+
+ // Number of bytes that would have been emitted to the buffer, if the buffer
+ // was sufficiently big. This number always excludes the trailing NUL byte
+ // and it is guaranteed to never grow bigger than kSSizeMax-1.
+ size_t count_;
+
+ DISALLOW_COPY_AND_ASSIGN(Buffer);
+};
+
+
+bool Buffer::IToASCII(bool sign, bool upcase, int64_t i, int base,
+ char pad, size_t padding, const char* prefix) {
+ // Sanity check for parameters. None of these should ever fail, but see
+ // above for the rationale why we can't call GURL_CHECK().
+ DEBUG_CHECK(base >= 2);
+ DEBUG_CHECK(base <= 16);
+ DEBUG_CHECK(!sign || base == 10);
+ DEBUG_CHECK(pad == '0' || pad == ' ');
+ DEBUG_CHECK(padding <= kSSizeMax);
+ DEBUG_CHECK(!(sign && prefix && *prefix));
+
+ // Handle negative numbers, if the caller indicated that |i| should be
+ // treated as a signed number; otherwise treat |i| as unsigned (even if the
+ // MSB is set!)
+ // Details are tricky, because of limited data-types, but equivalent pseudo-
+ // code would look like:
+ // if (sign && i < 0)
+ // prefix = "-";
+ // num = abs(i);
+ int minint = 0;  // Set to 1 only for the most negative int64_t; added back to the first emitted digit.
+ uint64_t num;
+ if (sign && i < 0) {
+ prefix = "-";
+
+ // Turn our number positive.
+ if (i == std::numeric_limits<int64_t>::min()) {
+ // The most negative integer needs special treatment.
+ minint = 1;
+ num = static_cast<uint64_t>(-(i + 1));
+ } else {
+ // "Normal" negative numbers are easy.
+ num = static_cast<uint64_t>(-i);
+ }
+ } else {
+ num = static_cast<uint64_t>(i);
+ }
+
+ // If padding with '0' zero, emit the prefix or '-' character now. Otherwise,
+ // make the prefix accessible in reverse order, so that we can later output
+ // it right between padding and the number.
+ // We cannot choose the easier approach of just reversing the number, as that
+ // fails in situations where we need to truncate numbers that have padding
+ // and/or prefixes.
+ const char* reverse_prefix = nullptr;
+ if (prefix && *prefix) {
+ if (pad == '0') {
+ while (*prefix) {
+ if (padding) {
+ --padding;
+ }
+ Out(*prefix++);
+ }
+ prefix = nullptr;
+ } else {
+ for (reverse_prefix = prefix; *reverse_prefix; ++reverse_prefix) {
+ }
+ }
+ } else
+ prefix = nullptr;
+ const size_t prefix_length = reverse_prefix - prefix;  // 0 unless a prefix is still pending for space padding.
+
+ // Loop until we have converted the entire number. Output at least one
+ // character (i.e. '0').
+ size_t start = count_;
+ size_t discarded = 0;
+ bool started = false;
+ do {
+ // Make sure there is still enough space left in our output buffer.
+ if (count_ >= size_) {
+ if (start < size_) {
+ // It is rare that we need to output a partial number. But if asked
+ // to do so, we will still make sure we output the correct number of
+ // leading digits.
+ // Since we are generating the digits in reverse order, we actually
+ // have to discard digits in the order that we have already emitted
+ // them. This is essentially equivalent to:
+ // memmove(buffer_ + start, buffer_ + start + 1, size_ - start - 1)
+ for (char* move = buffer_ + start, *end = buffer_ + size_ - 1;
+ move < end;
+ ++move) {
+ *move = move[1];
+ }
+ ++discarded;
+ --count_;
+ } else if (count_ - size_ > 1) {
+ // Need to increment either |count_| or |discarded| to make progress.
+ // The latter is more efficient, as it eventually triggers fast
+ // handling of padding. But we have to ensure we don't accidentally
+ // change the overall state (i.e. switch the state-machine from
+ // discarding to non-discarding). |count_| needs to always stay
+ // bigger than |size_|.
+ --count_;
+ ++discarded;
+ }
+ }
+
+ // Output the next digit and (if necessary) compensate for the most
+ // negative integer needing special treatment. This works because,
+ // no matter the bit width of the integer, the lowest-most decimal
+ // integer always ends in 2, 4, 6, or 8.
+ if (!num && started) {
+ if (reverse_prefix > prefix) {
+ Out(*--reverse_prefix);
+ } else {
+ Out(pad);
+ }
+ } else {
+ started = true;
+ Out((upcase ? kUpCaseHexDigits : kDownCaseHexDigits)[num%base + minint]);
+ }
+
+ minint = 0;
+ num /= base;
+
+ // Add padding, if requested.
+ if (padding > 0) {
+ --padding;
+
+ // Performance optimization for when we are asked to output excessive
+ // padding, but our output buffer is limited in size. Even if we output
+ // a 64bit number in binary, we would never write more than 64 plus
+ // prefix non-padding characters. So, once this limit has been passed,
+ // any further state change can be computed arithmetically; we know that
+ // by this time, our entire final output consists of padding characters
+ // that have all already been output.
+ if (discarded > 8*sizeof(num) + prefix_length) {
+ IncrementCount(padding);
+ padding = 0;
+ }
+ }
+ } while (num || padding || (reverse_prefix > prefix));
+
+ // Conversion to ASCII actually resulted in the digits being in reverse
+ // order. We can't easily generate them in forward order, as we can't tell
+ // the number of characters needed until we are done converting.
+ // So, now, we reverse the string (except for the possible '-' sign).
+ char* front = buffer_ + start;
+ char* back = GetInsertionPoint();
+ while (--back > front) {
+ char ch = *back;
+ *back = *front;
+ *front++ = ch;
+ }
+
+ IncrementCount(discarded);  // Re-add the bytes dropped during truncation so |count_| reflects the full length.
+ return !discarded;
+}
+
+} // anonymous namespace
+
+namespace internal {
+
+ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt, const Arg* args,
+ const size_t max_args) {
+ // Make sure that at least one NUL byte can be written, and that the buffer
+ // never overflows kSSizeMax. Not only does that use up most or all of the
+ // address space, it also would result in a return code that cannot be
+ // represented.
+ if (static_cast<ssize_t>(sz) < 1)
+ return -1;
+ sz = std::min(sz, kSSizeMax);
+
+ // Iterate over format string and interpret '%' arguments as they are
+ // encountered.
+ Buffer buffer(buf, sz);
+ size_t padding;
+ char pad;
+ for (unsigned int cur_arg = 0; *fmt && !buffer.OutOfAddressableSpace(); ) {
+ if (*fmt++ == '%') {
+ padding = 0;
+ pad = ' ';
+ char ch = *fmt++;
+ format_character_found:
+ switch (ch) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ // Found a width parameter. Convert to an integer value and store in
+ // "padding". If the leading digit is a zero, change the padding
+ // character from a space ' ' to a zero '0'.
+ pad = ch == '0' ? '0' : ' ';
+ for (;;) {
+ // The maximum allowed padding fills all the available address
+ // space and leaves just enough space to insert the trailing NUL.
+ const size_t max_padding = kSSizeMax - 1;
+ if (padding > max_padding/10 ||
+ 10*padding > max_padding - (ch - '0')) {
+ DEBUG_CHECK(padding <= max_padding/10 &&
+ 10*padding <= max_padding - (ch - '0'));
+ // Integer overflow detected. Skip the rest of the width until
+ // we find the format character, then do the normal error handling.
+ padding_overflow:
+ padding = max_padding;
+ while ((ch = *fmt++) >= '0' && ch <= '9') {
+ }
+ if (cur_arg < max_args) {
+ ++cur_arg;
+ }
+ goto fail_to_expand;
+ }
+ padding = 10*padding + ch - '0';
+ if (padding > max_padding) {
+ // This doesn't happen for "sane" values of kSSizeMax. But once
+ // kSSizeMax gets smaller than about 10, our earlier range checks
+ // are incomplete. Unittests do trigger this artificial corner
+ // case.
+ DEBUG_CHECK(padding <= max_padding);
+ goto padding_overflow;
+ }
+ ch = *fmt++;
+ if (ch < '0' || ch > '9') {
+ // Reached the end of the width parameter. This is where the format
+ // character is found.
+ goto format_character_found;
+ }
+ }
+ break;
+ case 'c': { // Output an ASCII character.
+ // Check that there are arguments left to be inserted.
+ if (cur_arg >= max_args) {
+ DEBUG_CHECK(cur_arg < max_args);
+ goto fail_to_expand;
+ }
+
+ // Check that the argument has the expected type.
+ const Arg& arg = args[cur_arg++];
+ if (arg.type != Arg::INT && arg.type != Arg::UINT) {
+ DEBUG_CHECK(arg.type == Arg::INT || arg.type == Arg::UINT);
+ goto fail_to_expand;
+ }
+
+ // Apply padding, if needed.
+ buffer.Pad(' ', padding, 1);  // %c always pads with spaces, even for a zero-prefixed width.
+
+ // Convert the argument to an ASCII character and output it.
+ char as_char = static_cast<char>(arg.integer.i);
+ if (!as_char) {
+ goto end_of_output_buffer;
+ }
+ buffer.Out(as_char);
+ break; }
+ case 'd': // Output a possibly signed decimal value.
+ case 'o': // Output an unsigned octal value.
+ case 'x': // Output an unsigned hexadecimal value.
+ case 'X':
+ case 'p': { // Output a pointer value.
+ // Check that there are arguments left to be inserted.
+ if (cur_arg >= max_args) {
+ DEBUG_CHECK(cur_arg < max_args);
+ goto fail_to_expand;
+ }
+
+ const Arg& arg = args[cur_arg++];
+ int64_t i;
+ const char* prefix = nullptr;
+ if (ch != 'p') {
+ // Check that the argument has the expected type.
+ if (arg.type != Arg::INT && arg.type != Arg::UINT) {
+ DEBUG_CHECK(arg.type == Arg::INT || arg.type == Arg::UINT);
+ goto fail_to_expand;
+ }
+ i = arg.integer.i;
+
+ if (ch != 'd') {
+ // The Arg() constructor automatically performed sign expansion on
+ // signed parameters. This is great when outputting a %d decimal
+ // number, but can result in unexpected leading 0xFF bytes when
+ // outputting a %x hexadecimal number. Mask bits, if necessary.
+ // We have to do this here, instead of in the Arg() constructor, as
+ // the Arg() constructor cannot tell whether we will output a %d
+ // or a %x. Only the latter should experience masking.
+ if (arg.integer.width < sizeof(int64_t)) {
+ i &= (1LL << (8*arg.integer.width)) - 1;
+ }
+ }
+ } else {
+ // Pointer values require an actual pointer or a string.
+ if (arg.type == Arg::POINTER) {
+ i = reinterpret_cast<uintptr_t>(arg.ptr);
+ } else if (arg.type == Arg::STRING) {
+ i = reinterpret_cast<uintptr_t>(arg.str);
+ } else if (arg.type == Arg::INT &&
+ arg.integer.width == sizeof(NULL) &&
+ arg.integer.i == 0) { // Allow C++'s version of NULL
+ i = 0;
+ } else {
+ DEBUG_CHECK(arg.type == Arg::POINTER || arg.type == Arg::STRING);
+ goto fail_to_expand;
+ }
+
+ // Pointers always include the "0x" prefix.
+ prefix = "0x";
+ }
+
+ // Use IToASCII() to convert to ASCII representation. For decimal
+ // numbers, optionally print a sign. For hexadecimal numbers,
+ // distinguish between upper and lower case. %p addresses are always
+ // printed as upcase. Supports base 8, 10, and 16. Prints padding
+ // and/or prefixes, if so requested.
+ buffer.IToASCII(ch == 'd' && arg.type == Arg::INT,
+ ch != 'x', i,
+ ch == 'o' ? 8 : ch == 'd' ? 10 : 16,
+ pad, padding, prefix);
+ break; }
+ case 's': {
+ // Check that there are arguments left to be inserted.
+ if (cur_arg >= max_args) {
+ DEBUG_CHECK(cur_arg < max_args);
+ goto fail_to_expand;
+ }
+
+ // Check that the argument has the expected type.
+ const Arg& arg = args[cur_arg++];
+ const char *s;
+ if (arg.type == Arg::STRING) {
+ s = arg.str ? arg.str : "<NULL>";
+ } else if (arg.type == Arg::INT && arg.integer.width == sizeof(NULL) &&
+ arg.integer.i == 0) { // Allow C++'s version of NULL
+ s = "<NULL>";
+ } else {
+ DEBUG_CHECK(arg.type == Arg::STRING);
+ goto fail_to_expand;
+ }
+
+ // Apply padding, if needed. This requires us to first check the
+ // length of the string that we are outputting.
+ if (padding) {
+ size_t len = 0;
+ for (const char* src = s; *src++; ) {
+ ++len;
+ }
+ buffer.Pad(' ', padding, len);  // %s always pads with spaces, even for a zero-prefixed width.
+ }
+
+ // Printing a string involves nothing more than copying it into the
+ // output buffer and making sure we don't output more bytes than
+ // available space; Out() takes care of doing that.
+ for (const char* src = s; *src; ) {
+ buffer.Out(*src++);
+ }
+ break; }
+ case '%':
+ // Quoted percent '%' character.
+ goto copy_verbatim;
+ fail_to_expand:
+ // C++ gives us tools to do type checking -- something that snprintf()
+ // could never really do. So, whenever we see arguments that don't
+ // match up with the format string, we refuse to output them. But
+ // since we have to be extremely conservative about being async-
+ // signal-safe, we are limited in the type of error handling that we
+ // can do in production builds (in debug builds we can use
+ // DEBUG_CHECK() and hope for the best). So, all we do is pass the
+ // format string unchanged. That should eventually get the user's
+ // attention; and in the meantime, it hopefully doesn't lose too much
+ // data.
+ default:
+ // Unknown or unsupported format character. Just copy verbatim to
+ // output.
+ buffer.Out('%');
+ DEBUG_CHECK(ch);
+ if (!ch) {
+ goto end_of_format_string;
+ }
+ buffer.Out(ch);
+ break;
+ }
+ } else {
+ copy_verbatim:
+ buffer.Out(fmt[-1]);
+ }
+ }
+ end_of_format_string:  // Jumped to when the format string ends right after a '%'.
+ end_of_output_buffer:  // Jumped to when %c is given a NUL character.
+ return buffer.GetCount();
+}
+
+} // namespace internal
+
+ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt) {
+ // Make sure that at least one NUL byte can be written, and that the buffer
+ // never overflows kSSizeMax. Not only does that use up most or all of the
+ // address space, it also would result in a return code that cannot be
+ // represented.
+ if (static_cast<ssize_t>(sz) < 1)
+ return -1;
+ sz = std::min(sz, kSSizeMax);
+
+ Buffer buffer(buf, sz);
+
+ // In the slow-path, we deal with errors by copying the contents of
+ // "fmt" unexpanded. This means, if there are no arguments passed, the
+ // SafeSPrintf() function always degenerates to a version of strncpy() that
+ // de-duplicates '%' characters.
+ const char* src = fmt;
+ for (; *src; ++src) {
+ buffer.Out(*src);
+ DEBUG_CHECK(src[0] != '%' || src[1] == '%');  // With no arguments, every '%' must be written as "%%".
+ if (src[0] == '%' && src[1] == '%') {
+ ++src;  // Skip the second '%' so that only one is emitted.
+ }
+ }
+ return buffer.GetCount();
+}
+
+} // namespace strings
+} // namespace gurl_base
diff --git a/base/strings/safe_sprintf.h b/base/strings/safe_sprintf.h
new file mode 100644
index 0000000..92f8c59
--- /dev/null
+++ b/base/strings/safe_sprintf.h
@@ -0,0 +1,246 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_SAFE_SPRINTF_H_
+#define BASE_STRINGS_SAFE_SPRINTF_H_
+
+#include "build/build_config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(OS_POSIX) || defined(OS_FUCHSIA)
+// For ssize_t
+#include <unistd.h>
+#endif
+
+#include "polyfills/base/base_export.h"
+
+namespace gurl_base {
+namespace strings {
+
+ #if defined(COMPILER_MSVC)
+ // MSVC builds get a namespace-local ssize_t sized to match the target.
+ #ifdef _WIN64
+ using ssize_t = __int64;
+ #else
+ using ssize_t = long;
+ #endif
+ #endif
+
+// SafeSPrintf() is a type-safe and completely self-contained version of
+// snprintf().
+//
+// SafeSNPrintf() is an alternative function signature that can be used when
+// not dealing with fixed-sized buffers. When possible, SafeSPrintf() should
+// always be used instead of SafeSNPrintf()
+//
+// These functions allow for formatting complicated messages from contexts that
+// require strict async-signal-safety. In fact, it is safe to call them from
+// any low-level execution context, as they are guaranteed to make no library
+// or system calls. It deliberately never touches "errno", either.
+//
+// The only exception to this rule is that in debug builds the code calls
+// RAW_CHECK() to help diagnose problems when the format string does not
+// match the rest of the arguments. In release builds, no GURL_CHECK()s are used,
+// and SafeSPrintf() instead returns an output string that expands only
+// those arguments that match their format characters. Mismatched arguments
+// are ignored.
+//
+// The code currently only supports a subset of format characters:
+// %c, %o, %d, %x, %X, %p, and %s.
+//
+// SafeSPrintf() aims to be as liberal as reasonably possible. Integer-like
+// values of arbitrary width can be passed to all of the format characters
+// that expect integers. Thus, it is explicitly legal to pass an "int" to
+// "%c", and output will automatically look at the LSB only. It is also
+// explicitly legal to pass either signed or unsigned values, and the format
+// characters will automatically interpret the arguments accordingly.
+//
+// It is still not legal to mix-and-match integer-like values with pointer
+// values. For instance, you cannot pass a pointer to %x, nor can you pass an
+// integer to %p.
+//
+// The one exception is "0" zero being accepted by "%p". This works-around
+// the problem of C++ defining NULL as an integer-like value.
+//
+// All format characters take an optional width parameter. This must be a
+// positive integer. For %d, %o, %x, %X and %p, if the width starts with
+// a leading '0', padding is done with '0' instead of ' ' characters.
+//
+// There are a few features of snprintf()-style format strings, that
+// SafeSPrintf() does not support at this time.
+//
+// If an actual user showed up, there is no particularly strong reason they
+// couldn't be added. But that assumes that the trade-offs between complexity
+// and utility are favorable.
+//
+// For example, adding support for negative padding widths, and for %n are all
+// likely to be viewed positively. They are all clearly useful, low-risk, easy
+// to test, don't jeopardize the async-signal-safety of the code, and overall
+// have little impact on other parts of SafeSPrintf() function.
+//
+ // On the other hand, support for alternate forms, positional arguments,
+ // grouping, wide characters, localization or floating point numbers is
+ // unlikely to ever be added.
+//
+// SafeSPrintf() and SafeSNPrintf() mimic the behavior of snprintf() and they
+// return the number of bytes needed to store the untruncated output. This
+// does *not* include the terminating NUL byte.
+//
+// They return -1, iff a fatal error happened. This typically can only happen,
+// if the buffer size is a) negative, or b) zero (i.e. not even the NUL byte
+// can be written). The return value can never be larger than SSIZE_MAX-1.
+// This ensures that the caller can always add one to the signed return code
+// in order to determine the amount of storage that needs to be allocated.
+//
+// While the code supports type checking and while it is generally very careful
+// to avoid printing incorrect values, it tends to be conservative in printing
+// as much as possible, even when given incorrect parameters. Typically, in
+// case of an error, the format string will not be expanded. (i.e. something
+// like SafeSPrintf(buf, "%p %d", 1, 2) results in "%p 2"). See above for
+// the use of RAW_CHECK() in debug builds, though.
+//
+// Basic example:
+// char buf[20];
+// gurl_base::strings::SafeSPrintf(buf, "The answer: %2d", 42);
+//
+// Example with dynamically sized buffer (async-signal-safe). This code won't
+// work on Visual studio, as it requires dynamically allocating arrays on the
+// stack. Consider picking a smaller value for |kMaxSize| if stack size is
+// limited and known. On the other hand, if the parameters to SafeSNPrintf()
+// are trusted and not controllable by the user, you can consider eliminating
+// the check for |kMaxSize| altogether. The current value of SSIZE_MAX is
+// essentially a no-op that just illustrates how to implement an upper bound:
+// const size_t kInitialSize = 128;
+// const size_t kMaxSize = std::numeric_limits<ssize_t>::max();
+// size_t size = kInitialSize;
+// for (;;) {
+// char buf[size];
+// size = SafeSNPrintf(buf, size, "Error message \"%s\"\n", err) + 1;
+// if (sizeof(buf) < kMaxSize && size > kMaxSize) {
+// size = kMaxSize;
+// continue;
+// } else if (size > sizeof(buf))
+// continue;
+// write(2, buf, size-1);
+// break;
+// }
+
+namespace internal {
+// Helpers that use C++ overloading, templates, and specializations to deduce
+// and record type information from function arguments. This allows us to
+// later write a type-safe version of snprintf().
+
+ struct Arg {
+ enum Type { INT, UINT, STRING, POINTER };
+
+ // Integer-like values: stored widened in |integer.i|, source size in |width|.
+ Arg(signed char value) : type(INT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(unsigned char value) : type(UINT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(short value) : type(INT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(unsigned short value) : type(UINT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(int value) : type(INT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(unsigned int value) : type(UINT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(long value) : type(INT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(unsigned long value) : type(UINT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(long long value) : type(INT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+ Arg(unsigned long long value) : type(UINT) {
+ integer.width = sizeof(value);
+ integer.i = static_cast<int64_t>(value);
+ }
+
+ // C-style text strings, const or mutable.
+ Arg(const char* text) : str(text), type(STRING) {}
+ Arg(char* text) : str(text), type(STRING) {}
+
+ // Every other pointer type; the address is kept as an untyped pointer.
+ template <typename T> Arg(T* address) : ptr((void*)address), type(POINTER) {}
+
+ union {
+ // Payload set by the integer constructors (|type| is INT or UINT).
+ struct {
+ int64_t i;
+ unsigned char width;
+ } integer;
+
+ // Payload set by the string constructors (|type| is STRING).
+ const char* str;
+
+ // Payload set by the generic pointer constructor (|type| is POINTER).
+ const void* ptr;
+ };
+ const Type type;
+ };
+
+ // Internal entry point that performs the actual formatting of an
+ // snprintf()-style format string; |args| points to |max_args| Arg values.
+ BASE_EXPORT ssize_t SafeSNPrintf(char* buf, size_t sz, const char* fmt,
+ const Arg* args, size_t max_args);
+
+ #if !defined(NDEBUG)
+ // In debug builds, allow unit tests to artificially lower the kSSizeMax
+ // constant that is used as a hard upper-bound for all buffers. In normal
+ // use, this constant should always be std::numeric_limits<ssize_t>::max().
+ BASE_EXPORT void SetSafeSPrintfSSizeMaxForTest(size_t max);  // Override.
+ BASE_EXPORT size_t GetSafeSPrintfSSizeMaxForTest();  // Current value.
+ #endif
+
+} // namespace internal
+
+ template <typename... Args>
+ ssize_t SafeSNPrintf(char* buf, size_t N, const char* fmt, Args... args) {
+ // Each argument is wrapped in an internal::Arg, which records its type
+ // alongside its value; the resulting array is handed to the formatter.
+ const internal::Arg tagged_args[] = { args... };
+ return internal::SafeSNPrintf(buf, N, fmt, tagged_args, sizeof...(Args));
+ }
+
+ template <size_t N, typename... Args>
+ ssize_t SafeSPrintf(char (&buf)[N], const char* fmt, Args... args) {
+ // The buffer size N is deduced from the array reference. Each argument is
+ // wrapped in an internal::Arg so that its type travels with its value.
+ const internal::Arg tagged_args[] = { args... };
+ return internal::SafeSNPrintf(buf, N, fmt, tagged_args, sizeof...(Args));
+ }
+
+ // No-argument fast path: with nothing to substitute, no Arg array is built.
+ BASE_EXPORT ssize_t SafeSNPrintf(char* buf, size_t N, const char* fmt);
+ template <size_t N>
+ inline ssize_t SafeSPrintf(char (&buf)[N], const char* fmt) {
+ return SafeSNPrintf(&buf[0], N, fmt);
+ }
+
+} // namespace strings
+ } // namespace gurl_base
+
+#endif // BASE_STRINGS_SAFE_SPRINTF_H_
diff --git a/base/strings/safe_sprintf_unittest.cc b/base/strings/safe_sprintf_unittest.cc
new file mode 100644
index 0000000..b7a67fa
--- /dev/null
+++ b/base/strings/safe_sprintf_unittest.cc
@@ -0,0 +1,765 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/safe_sprintf.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <limits>
+#include <memory>
+
+#include "polyfills/base/logging.h"
+#include "base/macros.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+ // Death tests on Android are currently very flaky. No need to add more flaky
+ // tests, as they just make it hard to spot real problems.
+ // TODO(markus): See if the restrictions on Android can eventually be lifted.
+ #if defined(GTEST_HAS_DEATH_TEST) && !defined(OS_ANDROID)
+ #define ALLOW_DEATH_TEST
+ #endif  // defined(GTEST_HAS_DEATH_TEST) && !defined(OS_ANDROID)
+
+namespace gurl_base {
+namespace strings {
+
+ TEST(SafeSPrintfTest, Empty) {
+ char out[] = { 'X', 'X' };
+
+ // A size that is negative when read as ssize_t must be rejected untouched.
+ EXPECT_EQ(-1, SafeSNPrintf(out, std::numeric_limits<size_t>::max(), ""));
+ EXPECT_EQ('X', out[1]);
+ EXPECT_EQ('X', out[0]);
+
+ // A size of zero leaves no room for the NUL byte and is rejected as well.
+ EXPECT_EQ(-1, SafeSNPrintf(out, 0, ""));
+ EXPECT_EQ('X', out[1]);
+ EXPECT_EQ('X', out[0]);
+
+ // With room for exactly one byte, that byte becomes the terminating NUL.
+ EXPECT_EQ(0, SafeSNPrintf(out, 1, ""));
+ EXPECT_EQ('X', out[1]);
+ EXPECT_EQ(0, out[0]);
+ out[0] = 'X';
+
+ // With more room than needed, bytes after the NUL must not be modified.
+ EXPECT_EQ(0, SafeSNPrintf(out, 2, ""));
+ EXPECT_EQ('X', out[1]);
+ EXPECT_EQ(0, out[0]);
+ out[0] = 'X';
+
+ // Same expectation through the array-deducing SafeSPrintf() entry point.
+ EXPECT_EQ(0, SafeSPrintf(out, ""));
+ EXPECT_EQ('X', out[1]);
+ EXPECT_EQ(0, out[0]);
+ out[0] = 'X';
+ }
+
+ TEST(SafeSPrintfTest, NoArguments) {
+ // Output a text message that doesn't require any substitutions. This
+ // is roughly equivalent to calling strncpy() (but unlike strncpy(), it does
+ // always add a trailing NUL; it always deduplicates '%' characters).
+ static const char text[] = "hello world";
+ char ref[20], buf[20];
+ memset(ref, 'X', sizeof(ref));  // |ref| is the all-'X' "untouched" pattern.
+ memcpy(buf, ref, sizeof(buf));
+
+ // A negative buffer size should always result in an error.
+ EXPECT_EQ(-1, SafeSNPrintf(buf, static_cast<size_t>(-1), text));
+ EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf)));
+
+ // Zero buffer size should always result in an error.
+ EXPECT_EQ(-1, SafeSNPrintf(buf, 0, text));
+ EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf)));
+
+ // A one-byte buffer should always print a single NUL byte. The return
+ // value is still the full, untruncated length of |text|.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSNPrintf(buf, 1, text));
+ EXPECT_EQ(0, buf[0]);
+ EXPECT_TRUE(!memcmp(buf+1, ref+1, sizeof(buf)-1));
+ memcpy(buf, ref, sizeof(buf));
+
+ // A larger (but limited) buffer should always leave the trailing bytes
+ // unchanged.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSNPrintf(buf, 2, text));
+ EXPECT_EQ(text[0], buf[0]);
+ EXPECT_EQ(0, buf[1]);
+ EXPECT_TRUE(!memcmp(buf+2, ref+2, sizeof(buf)-2));
+ memcpy(buf, ref, sizeof(buf));
+
+ // An unrestricted buffer length should always leave the trailing bytes
+ // unchanged.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1,
+ SafeSNPrintf(buf, sizeof(buf), text));
+ EXPECT_EQ(std::string(text), std::string(buf));
+ EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text),
+ sizeof(buf) - sizeof(text)));
+ memcpy(buf, ref, sizeof(buf));
+
+ // The same test using SafeSPrintf() instead of SafeSNPrintf().
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSPrintf(buf, text));
+ EXPECT_EQ(std::string(text), std::string(buf));
+ EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text),
+ sizeof(buf) - sizeof(text)));
+ memcpy(buf, ref, sizeof(buf));
+
+ // Check for deduplication of '%' percent characters.
+ EXPECT_EQ(1, SafeSPrintf(buf, "%%"));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%%"));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%X"));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%%%%X"));
+ #if defined(NDEBUG)
+ EXPECT_EQ(1, SafeSPrintf(buf, "%"));  // Release: a lone '%' is kept as-is.
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%"));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%X"));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%%%X"));
+ #elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, "%"), "src.1. == '%'");  // '.' matches [ and ].
+ EXPECT_DEATH(SafeSPrintf(buf, "%%%"), "src.1. == '%'");
+ EXPECT_DEATH(SafeSPrintf(buf, "%X"), "src.1. == '%'");
+ EXPECT_DEATH(SafeSPrintf(buf, "%%%X"), "src.1. == '%'");
+ #endif
+ }
+
+ TEST(SafeSPrintfTest, OneArgument) {
+ // Single-argument substitution: "%c" in |fmt| expands to ' ', giving |text|.
+ const char text[] = "hello world";
+ const char fmt[] = "hello%cworld";
+ char ref[20], buf[20];
+ memset(ref, 'X', sizeof(ref));  // Size the fill by the array being filled.
+ memcpy(buf, ref, sizeof(buf));
+
+ // A negative buffer size should always result in an error.
+ EXPECT_EQ(-1, SafeSNPrintf(buf, static_cast<size_t>(-1), fmt, ' '));
+ EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf)));
+
+ // Zero buffer size should always result in an error.
+ EXPECT_EQ(-1, SafeSNPrintf(buf, 0, fmt, ' '));
+ EXPECT_TRUE(!memcmp(buf, ref, sizeof(buf)));
+
+ // A one-byte buffer should always print a single NUL byte.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1,
+ SafeSNPrintf(buf, 1, fmt, ' '));
+ EXPECT_EQ(0, buf[0]);
+ EXPECT_TRUE(!memcmp(buf+1, ref+1, sizeof(buf)-1));
+ memcpy(buf, ref, sizeof(buf));
+
+ // A larger (but limited) buffer should always leave the trailing bytes
+ // unchanged.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1,
+ SafeSNPrintf(buf, 2, fmt, ' '));
+ EXPECT_EQ(text[0], buf[0]);
+ EXPECT_EQ(0, buf[1]);
+ EXPECT_TRUE(!memcmp(buf+2, ref+2, sizeof(buf)-2));
+ memcpy(buf, ref, sizeof(buf));
+
+ // An unrestricted buffer length should always leave the trailing bytes
+ // unchanged.
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1,
+ SafeSNPrintf(buf, sizeof(buf), fmt, ' '));
+ EXPECT_EQ(std::string(text), std::string(buf));
+ EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text),
+ sizeof(buf) - sizeof(text)));
+ memcpy(buf, ref, sizeof(buf));
+
+ // The same test using SafeSPrintf() instead of SafeSNPrintf().
+ EXPECT_EQ(static_cast<ssize_t>(sizeof(text))-1, SafeSPrintf(buf, fmt, ' '));
+ EXPECT_EQ(std::string(text), std::string(buf));
+ EXPECT_TRUE(!memcmp(buf + sizeof(text), ref + sizeof(text),
+ sizeof(buf) - sizeof(text)));
+ memcpy(buf, ref, sizeof(buf));
+
+ // Check for deduplication of '%' percent characters. "%Y" is not a
+ // supported conversion, so those two characters are expected in the output.
+ EXPECT_EQ(1, SafeSPrintf(buf, "%%", 0));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%%", 0));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%Y", 0));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%Y", 0));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%%%Y", 0));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%%%%Y", 0));
+ #if defined(NDEBUG)
+ EXPECT_EQ(1, SafeSPrintf(buf, "%", 0));  // Release: trailing '%' is kept.
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%", 0));
+ #elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, "%", 0), "ch");  // Matches DEBUG_CHECK(ch).
+ EXPECT_DEATH(SafeSPrintf(buf, "%%%", 0), "ch");
+ #endif
+ }
+
+ TEST(SafeSPrintfTest, MissingArg) {
+ #if defined(NDEBUG)
+ char out[20];  // Release: the unmatched second "%c" is output unexpanded.
+ EXPECT_EQ(3, SafeSPrintf(out, "%c%c", 'A'));
+ EXPECT_STREQ("A%c", out);
+ #elif defined(ALLOW_DEATH_TEST)
+ char out[20];  // Debug: the call must die on the "cur_arg < max_args" check.
+ EXPECT_DEATH(SafeSPrintf(out, "%c%c", 'A'), "cur_arg < max_args");
+ #endif
+ }
+
+ TEST(SafeSPrintfTest, ASANFriendlyBufferTest) {
+ // Print into a heap buffer that is exactly as large as the output plus its
+ // NUL, so that ASAN can flag any write past the end. PrintLongString()
+ // covers more edge cases, but is harder to debug when it fails.
+ const char kTestString[] = "This is a test";
+ const size_t kSize = sizeof(kTestString);
+ const ssize_t kLength = static_cast<ssize_t>(kSize - 1);
+ std::unique_ptr<char[]> exact(new char[kSize]);
+ char* const out = exact.get();
+ EXPECT_EQ(kLength, SafeSNPrintf(out, kSize, kTestString));
+ EXPECT_STREQ(kTestString, out);
+ EXPECT_EQ(kLength, SafeSNPrintf(out, kSize, "%s", kTestString));
+ EXPECT_STREQ(kTestString, out);
+ }
+
+ TEST(SafeSPrintfTest, NArgs) {
+ // Exercise every argument count from one through eleven. The remark about
+ // pre-C++11 compilers being limited to ten arguments is historical.
+ // We test both SafeSPrintf() and SafeSNPrintf(). This makes sure both
+ // entry points forward the same number of arguments, in the same order,
+ // to the formatter.
+ char buf[12];
+ EXPECT_EQ(1, SafeSPrintf(buf, "%c", 1));
+ EXPECT_EQ("\1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%c%c", 1, 2));
+ EXPECT_EQ("\1\2", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%c%c%c", 1, 2, 3));
+ EXPECT_EQ("\1\2\3", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%c%c%c%c", 1, 2, 3, 4));
+ EXPECT_EQ("\1\2\3\4", std::string(buf));
+ EXPECT_EQ(5, SafeSPrintf(buf, "%c%c%c%c%c", 1, 2, 3, 4, 5));
+ EXPECT_EQ("\1\2\3\4\5", std::string(buf));
+ EXPECT_EQ(6, SafeSPrintf(buf, "%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6));
+ EXPECT_EQ("\1\2\3\4\5\6", std::string(buf));
+ EXPECT_EQ(7, SafeSPrintf(buf, "%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7));
+ EXPECT_EQ("\1\2\3\4\5\6\7", std::string(buf));
+ EXPECT_EQ(8, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7, 8));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10", std::string(buf));
+ EXPECT_EQ(9, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11", std::string(buf));
+ EXPECT_EQ(10, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
+
+ // The next check completes the SafeSPrintf() series; SafeSNPrintf() follows.
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12", std::string(buf));
+ EXPECT_EQ(1, SafeSNPrintf(buf, 11, "%c", 1));
+ EXPECT_EQ("\1", std::string(buf));
+ EXPECT_EQ(2, SafeSNPrintf(buf, 11, "%c%c", 1, 2));
+ EXPECT_EQ("\1\2", std::string(buf));
+ EXPECT_EQ(3, SafeSNPrintf(buf, 11, "%c%c%c", 1, 2, 3));
+ EXPECT_EQ("\1\2\3", std::string(buf));
+ EXPECT_EQ(4, SafeSNPrintf(buf, 11, "%c%c%c%c", 1, 2, 3, 4));
+ EXPECT_EQ("\1\2\3\4", std::string(buf));
+ EXPECT_EQ(5, SafeSNPrintf(buf, 11, "%c%c%c%c%c", 1, 2, 3, 4, 5));
+ EXPECT_EQ("\1\2\3\4\5", std::string(buf));
+ EXPECT_EQ(6, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6));
+ EXPECT_EQ("\1\2\3\4\5\6", std::string(buf));
+ EXPECT_EQ(7, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c", 1, 2, 3, 4, 5, 6, 7));
+ EXPECT_EQ("\1\2\3\4\5\6\7", std::string(buf));
+ EXPECT_EQ(8, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10", std::string(buf));
+ EXPECT_EQ(9, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11", std::string(buf));
+ EXPECT_EQ(10, SafeSNPrintf(buf, 11, "%c%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12", std::string(buf));
+
+ EXPECT_EQ(11, SafeSPrintf(buf, "%c%c%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12\13", std::string(buf));
+ EXPECT_EQ(11, SafeSNPrintf(buf, 12, "%c%c%c%c%c%c%c%c%c%c%c",
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));
+ EXPECT_EQ("\1\2\3\4\5\6\7\10\11\12\13", std::string(buf));
+ }
+
+ TEST(SafeSPrintfTest, DataTypes) {
+ char buf[40];
+
+ // Bytes: unsigned values print as-is, signed values keep their sign.
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint8_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%d", (uint8_t)-1));
+ EXPECT_EQ("255", std::string(buf));
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int8_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int8_t)-1));
+ EXPECT_EQ("-1", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%d", (int8_t)-128));
+ EXPECT_EQ("-128", std::string(buf));
+
+ // Half-words
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint16_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(5, SafeSPrintf(buf, "%d", (uint16_t)-1));
+ EXPECT_EQ("65535", std::string(buf));
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int16_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int16_t)-1));
+ EXPECT_EQ("-1", std::string(buf));
+ EXPECT_EQ(6, SafeSPrintf(buf, "%d", (int16_t)-32768));
+ EXPECT_EQ("-32768", std::string(buf));
+
+ // Words
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint32_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(10, SafeSPrintf(buf, "%d", (uint32_t)-1));
+ EXPECT_EQ("4294967295", std::string(buf));
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int32_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int32_t)-1));
+ EXPECT_EQ("-1", std::string(buf));
+ // Work-around for a limitation of C90: the minimum is spelled as "-MAX-1".
+ EXPECT_EQ(11, SafeSPrintf(buf, "%d", (int32_t)-2147483647-1));
+ EXPECT_EQ("-2147483648", std::string(buf));
+
+ // Quads
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (uint64_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(20, SafeSPrintf(buf, "%d", (uint64_t)-1));
+ EXPECT_EQ("18446744073709551615", std::string(buf));
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", (int64_t)1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%d", (int64_t)-1));
+ EXPECT_EQ("-1", std::string(buf));
+ // Work-around for a limitation of C90: the minimum is spelled as "-MAX-1".
+ EXPECT_EQ(20, SafeSPrintf(buf, "%d", (int64_t)-9223372036854775807LL-1));
+ EXPECT_EQ("-9223372036854775808", std::string(buf));
+
+ // Strings (both const and mutable).
+ EXPECT_EQ(4, SafeSPrintf(buf, "test"));
+ EXPECT_EQ("test", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, buf));  // |buf| is both format and output here.
+ EXPECT_EQ("test", std::string(buf));
+
+ // Pointer: the reference text in |addr| is produced with libc sprintf().
+ char addr[20];
+ sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)buf);
+ SafeSPrintf(buf, "%p", buf);
+ EXPECT_EQ(std::string(addr), std::string(buf));
+ SafeSPrintf(buf, "%p", (const char *)buf);
+ EXPECT_EQ(std::string(addr), std::string(buf));
+ sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)sprintf);
+ SafeSPrintf(buf, "%p", sprintf);  // Function pointers are accepted, too.
+ EXPECT_EQ(std::string(addr), std::string(buf));
+
+ // Padding for pointers is a little more complicated because of the "0x"
+ // prefix. Padding with '0' zeros is relatively straight-forward, but
+ // padding with ' ' spaces requires more effort.
+ sprintf(addr, "0x%017llX", (unsigned long long)(uintptr_t)buf);
+ SafeSPrintf(buf, "%019p", buf);
+ EXPECT_EQ(std::string(addr), std::string(buf));
+ sprintf(addr, "0x%llX", (unsigned long long)(uintptr_t)buf);
+ memset(addr, ' ',  // Fill the gap in front of the text moved by memmove().
+ (char*)memmove(addr + sizeof(addr) - strlen(addr) - 1,
+ addr, strlen(addr)+1) - addr);  // Right-justifies the text in |addr|.
+ SafeSPrintf(buf, "%19p", buf);
+ EXPECT_EQ(std::string(addr), std::string(buf));
+ }
+
+namespace {
+ void PrintLongString(char* buf, size_t sz) {
+ // Output a reasonably complex expression into a limited-size buffer of |sz|
+ // bytes, verify it, and copy it to |buf|. |sz| must leave room for the NUL.
+ GURL_CHECK_GT(sz, static_cast<size_t>(0));
+
+ // Allocate slightly more space, so that we can verify that SafeSPrintf()
+ // never writes past the end of the buffer.
+ std::unique_ptr<char[]> tmp(new char[sz + 2]);
+ memset(tmp.get(), 'X', sz+2);
+
+ // Use SafeSPrintf() to output a complex list of arguments:
+ // - test padding and truncating %c single characters.
+ // - test truncating %s simple strings.
+ // - test mismatching arguments and truncating (for %d != %s).
+ // - test zero-padding and truncating %x hexadecimal numbers.
+ // - test outputting and truncating %d MININT.
+ // - test outputting and truncating %p arbitrary pointer values.
+ // - test outputting, padding and truncating NULL-pointer %s strings.
+ char* out = tmp.get();
+ size_t out_sz = sz;
+ size_t len;
+ for (std::unique_ptr<char[]> perfect_buf;;) {
+ size_t needed =
+ SafeSNPrintf(out, out_sz,
+ #if defined(NDEBUG)
+ "A%2cong %s: %d %010X %d %p%7s", 'l', "string", "",
+ #else
+ "A%2cong %s: %%d %010X %d %p%7s", 'l', "string",
+ #endif
+ 0xDEADBEEF, std::numeric_limits<intptr_t>::min(),
+ PrintLongString, static_cast<char*>(nullptr)) +
+ 1;
+
+ // Various sanity checks:
+ // The number of characters needed to print the full string should always
+ // be bigger or equal to the bytes that have actually been output.
+ len = strlen(tmp.get());  // Always measures |tmp|, even on the second pass.
+ GURL_CHECK_GE(needed, len+1);
+
+ // The number of characters output should always fit into the buffer that
+ // was passed into SafeSPrintf().
+ GURL_CHECK_LT(len, out_sz);
+
+ // The output is always terminated with a NUL byte (actually, this test is
+ // always going to pass, as strlen() already verified this)
+ EXPECT_FALSE(tmp[len]);
+
+ // ASAN can check that we are not overwriting buffers, iff we make sure the
+ // buffer is exactly the size that we are expecting to be written. After
+ // running SafeSNPrintf() the first time, it is possible to compute the
+ // correct buffer size for this test. So, allocate a second buffer and run
+ // the exact same SafeSNPrintf() command again.
+ if (!perfect_buf.get()) {
+ out_sz = std::min(needed, sz);
+ out = new char[out_sz];
+ perfect_buf.reset(out);  // |perfect_buf| owns and later frees |out|.
+ } else {
+ break;  // Second pass done; its output is not inspected further here.
+ }
+ }
+
+ // All trailing bytes are unchanged.
+ for (size_t i = len+1; i < sz+2; ++i)
+ EXPECT_EQ('X', tmp[i]);
+
+ // The text that was generated by SafeSPrintf() should always match the
+ // equivalent text generated by sprintf(). Please note that the format
+ // string for sprintf() is not complicated, as it does not have the
+ // benefit of getting type information from the C++ compiler.
+ //
+ // N.B.: It would be so much cleaner to use snprintf(). But unfortunately,
+ // Visual Studio doesn't support this function, and the work-arounds
+ // are all really awkward.
+ char ref[256];
+ GURL_CHECK_LE(sz, sizeof(ref));
+ sprintf(ref, "A long string: %%d 00DEADBEEF %lld 0x%llX <NULL>",
+ static_cast<long long>(std::numeric_limits<intptr_t>::min()),
+ static_cast<unsigned long long>(
+ reinterpret_cast<uintptr_t>(PrintLongString)));
+ ref[sz-1] = '\000';  // Truncate the reference to what fits in |sz| bytes.
+
+ #if defined(NDEBUG)
+ const size_t kSSizeMax = std::numeric_limits<ssize_t>::max();
+ #else
+ const size_t kSSizeMax = internal::GetSafeSPrintfSSizeMaxForTest();
+ #endif
+
+ // Compare the output from SafeSPrintf() to the one from sprintf().
+ EXPECT_EQ(std::string(ref).substr(0, kSSizeMax-1), std::string(tmp.get()));
+
+ // We allocated a slightly larger buffer, so that we could perform some
+ // extra sanity checks. Now that the tests have all passed, we copy the
+ // data to the output buffer that the caller provided.
+ memcpy(buf, tmp.get(), len+1);
+ }
+
+#if !defined(NDEBUG)
+ class ScopedSafeSPrintfSSizeMaxSetter {
+ public:
+ ScopedSafeSPrintfSSizeMaxSetter(size_t sz)
+ : old_ssize_max_(internal::GetSafeSPrintfSSizeMaxForTest()) {
+ internal::SetSafeSPrintfSSizeMaxForTest(sz);  // Install the override.
+ }
+
+ ~ScopedSafeSPrintfSSizeMaxSetter() {
+ internal::SetSafeSPrintfSSizeMaxForTest(old_ssize_max_);  // Put it back.
+ }
+
+ private:
+ size_t old_ssize_max_;  // Limit that was in effect before construction.
+
+ DISALLOW_COPY_AND_ASSIGN(ScopedSafeSPrintfSSizeMaxSetter);
+ };
+#endif
+
+} // anonymous namespace
+
+ TEST(SafeSPrintfTest, Truncation) {
+ // We use PrintLongString() to print a complex long string and then
+ // truncate to all possible lengths. This ends up exercising a lot of
+ // different code paths in SafeSPrintf() and IToASCII(), as truncation can
+ // happen in a lot of different states.
+ char ref[256];
+ PrintLongString(ref, sizeof(ref));  // Untruncated reference output.
+ for (size_t i = strlen(ref)+1; i; --i) {
+ char buf[sizeof(ref)];
+ PrintLongString(buf, i);  // A buffer of |i| bytes holds i-1 characters.
+ EXPECT_EQ(std::string(ref, i - 1), std::string(buf));
+ }
+
+ // When compiling in debug mode, we have the ability to fake a small
+ // upper limit for the maximum value that can be stored in an ssize_t.
+ // SafeSPrintf() uses this upper limit to determine how many bytes it will
+ // write to the buffer, even if the caller claimed a bigger buffer size.
+ // Repeat the truncation test and verify that this other code path in
+ // SafeSPrintf() works correctly, too.
+ #if !defined(NDEBUG)
+ for (size_t i = strlen(ref)+1; i > 1; --i) {
+ ScopedSafeSPrintfSSizeMaxSetter ssize_max_setter(i);  // Reset per pass.
+ char buf[sizeof(ref)];
+ PrintLongString(buf, sizeof(buf));
+ EXPECT_EQ(std::string(ref, i - 1), std::string(buf));
+ }
+
+ // kSSizeMax is also used to constrain the maximum amount of padding, before
+ // SafeSPrintf() detects an error in the format string.
+ ScopedSafeSPrintfSSizeMaxSetter ssize_max_setter(100);
+ char buf[256];
+ EXPECT_EQ(99, SafeSPrintf(buf, "%99c", ' '));  // Widest padding accepted.
+ EXPECT_EQ(std::string(99, ' '), std::string(buf));
+ *buf = '\000';
+ #if defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, "%100c", ' '), "padding <= max_padding");
+ #endif
+ EXPECT_EQ(0, *buf);  // |buf| must still hold the empty string set above.
+ #endif
+ }
+
+TEST(SafeSPrintfTest, Padding) {
+ char buf[40], fmt[40];
+
+ // Chars %c
+ EXPECT_EQ(1, SafeSPrintf(buf, "%c", 'A'));
+ EXPECT_EQ("A", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%2c", 'A'));
+ EXPECT_EQ(" A", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%02c", 'A'));
+ EXPECT_EQ(" A", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2c", 'A'));
+ EXPECT_EQ("%-2c", std::string(buf));
+ SafeSPrintf(fmt, "%%%dc", std::numeric_limits<ssize_t>::max() - 1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1, SafeSPrintf(buf, fmt, 'A'));
+ SafeSPrintf(fmt, "%%%dc",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, 'A'));
+ EXPECT_EQ("%c", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, 'A'), "padding <= max_padding");
+#endif
+
+ // Octal %o
+ EXPECT_EQ(1, SafeSPrintf(buf, "%o", 1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%2o", 1));
+ EXPECT_EQ(" 1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%02o", 1));
+ EXPECT_EQ("01", std::string(buf));
+ EXPECT_EQ(12, SafeSPrintf(buf, "%12o", -1));
+ EXPECT_EQ(" 37777777777", std::string(buf));
+ EXPECT_EQ(12, SafeSPrintf(buf, "%012o", -1));
+ EXPECT_EQ("037777777777", std::string(buf));
+ EXPECT_EQ(23, SafeSPrintf(buf, "%23o", -1LL));
+ EXPECT_EQ(" 1777777777777777777777", std::string(buf));
+ EXPECT_EQ(23, SafeSPrintf(buf, "%023o", -1LL));
+ EXPECT_EQ("01777777777777777777777", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%2o", 0111));
+ EXPECT_EQ("111", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2o", 1));
+ EXPECT_EQ("%-2o", std::string(buf));
+ SafeSPrintf(fmt, "%%%do", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%0%do", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ("000", std::string(buf));
+ SafeSPrintf(fmt, "%%%do",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1));
+ EXPECT_EQ("%o", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding");
+#endif
+
+ // Decimals %d
+ EXPECT_EQ(1, SafeSPrintf(buf, "%d", 1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%2d", 1));
+ EXPECT_EQ(" 1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%02d", 1));
+ EXPECT_EQ("01", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%3d", -1));
+ EXPECT_EQ(" -1", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%03d", -1));
+ EXPECT_EQ("-01", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%2d", 111));
+ EXPECT_EQ("111", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%2d", -111));
+ EXPECT_EQ("-111", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2d", 1));
+ EXPECT_EQ("%-2d", std::string(buf));
+ SafeSPrintf(fmt, "%%%dd", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%0%dd", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ("000", std::string(buf));
+ SafeSPrintf(fmt, "%%%dd",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1));
+ EXPECT_EQ("%d", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding");
+#endif
+
+ // Hex %X
+ EXPECT_EQ(1, SafeSPrintf(buf, "%X", 1));
+ EXPECT_EQ("1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%2X", 1));
+ EXPECT_EQ(" 1", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%02X", 1));
+ EXPECT_EQ("01", std::string(buf));
+ EXPECT_EQ(9, SafeSPrintf(buf, "%9X", -1));
+ EXPECT_EQ(" FFFFFFFF", std::string(buf));
+ EXPECT_EQ(9, SafeSPrintf(buf, "%09X", -1));
+ EXPECT_EQ("0FFFFFFFF", std::string(buf));
+ EXPECT_EQ(17, SafeSPrintf(buf, "%17X", -1LL));
+ EXPECT_EQ(" FFFFFFFFFFFFFFFF", std::string(buf));
+ EXPECT_EQ(17, SafeSPrintf(buf, "%017X", -1LL));
+ EXPECT_EQ("0FFFFFFFFFFFFFFFF", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%2X", 0x111));
+ EXPECT_EQ("111", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2X", 1));
+ EXPECT_EQ("%-2X", std::string(buf));
+ SafeSPrintf(fmt, "%%%dX", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%0%dX", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, 1));
+ EXPECT_EQ("000", std::string(buf));
+ SafeSPrintf(fmt, "%%%dX",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1));
+ EXPECT_EQ("%X", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding");
+#endif
+
+ // Pointer %p
+ EXPECT_EQ(3, SafeSPrintf(buf, "%p", (void*)1));
+ EXPECT_EQ("0x1", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%4p", (void*)1));
+ EXPECT_EQ(" 0x1", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%04p", (void*)1));
+ EXPECT_EQ("0x01", std::string(buf));
+ EXPECT_EQ(5, SafeSPrintf(buf, "%4p", (void*)0x111));
+ EXPECT_EQ("0x111", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2p", (void*)1));
+ EXPECT_EQ("%-2p", std::string(buf));
+ SafeSPrintf(fmt, "%%%dp", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, (void*)1));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%0%dp", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, (void*)1));
+ EXPECT_EQ("0x0", std::string(buf));
+ SafeSPrintf(fmt, "%%%dp",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, 1));
+ EXPECT_EQ("%p", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, 1), "padding <= max_padding");
+#endif
+
+ // String
+ EXPECT_EQ(1, SafeSPrintf(buf, "%s", "A"));
+ EXPECT_EQ("A", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%2s", "A"));
+ EXPECT_EQ(" A", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%02s", "A"));
+ EXPECT_EQ(" A", std::string(buf));
+ EXPECT_EQ(3, SafeSPrintf(buf, "%2s", "AAA"));
+ EXPECT_EQ("AAA", std::string(buf));
+ EXPECT_EQ(4, SafeSPrintf(buf, "%-2s", "A"));
+ EXPECT_EQ("%-2s", std::string(buf));
+ SafeSPrintf(fmt, "%%%ds", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, "A"));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%0%ds", std::numeric_limits<ssize_t>::max()-1);
+ EXPECT_EQ(std::numeric_limits<ssize_t>::max()-1,
+ SafeSNPrintf(buf, 4, fmt, "A"));
+ EXPECT_EQ(" ", std::string(buf));
+ SafeSPrintf(fmt, "%%%ds",
+ static_cast<size_t>(std::numeric_limits<ssize_t>::max()));
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, fmt, "A"));
+ EXPECT_EQ("%s", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, fmt, "A"), "padding <= max_padding");
+#endif
+}
+
+TEST(SafeSPrintfTest, EmbeddedNul) {
+ char buf[] = { 'X', 'X', 'X', 'X' };  // Four sentinel bytes, no terminator.
+ EXPECT_EQ(2, SafeSPrintf(buf, "%3c", 0));
+ EXPECT_EQ(' ', buf[0]);
+ EXPECT_EQ(' ', buf[1]);
+ EXPECT_EQ(0, buf[2]);
+ EXPECT_EQ('X', buf[3]);  // The final sentinel byte must be left untouched.
+
+ // Check handling of a NUL format character. N.B. this takes two different
+ // code paths depending on whether we are actually passing arguments. If
+ // we don't have any arguments, we are running in the fast-path code, that
+ // looks (almost) like a strncpy().
+#if defined(NDEBUG)
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%"));  // Call without arguments.
+ EXPECT_EQ("%%", std::string(buf));
+ EXPECT_EQ(2, SafeSPrintf(buf, "%%%", 0));  // Call with one argument.
+ EXPECT_EQ("%%", std::string(buf));
+#elif defined(ALLOW_DEATH_TEST)
+ EXPECT_DEATH(SafeSPrintf(buf, "%%%"), "src.1. == '%'");
+ EXPECT_DEATH(SafeSPrintf(buf, "%%%", 0), "ch");
+#endif
+}
+
+TEST(SafeSPrintfTest, EmitNULL) {
+  char buf[40];
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion-null"
+#endif  // Warning is suppressed only while NULL is passed as an argument.
+  EXPECT_EQ(1, SafeSPrintf(buf, "%d", NULL));  // NULL formatted as an integer.
+  EXPECT_EQ("0", std::string(buf));
+  EXPECT_EQ(3, SafeSPrintf(buf, "%p", NULL));  // NULL formatted as a pointer.
+  EXPECT_EQ("0x0", std::string(buf));
+  EXPECT_EQ(6, SafeSPrintf(buf, "%s", NULL));  // NULL formatted as a string.
+  EXPECT_EQ("<NULL>", std::string(buf));
+#if defined(__GNUC__)  // Same guard as the push; __GCC__ is never predefined.
+#pragma GCC diagnostic pop
+#endif
+}
+
+TEST(SafeSPrintfTest, PointerSize) {
+ // The internal data representation is a 64bit value, independent of the
+ // native word size. We want to perform sign-extension for signed integers,
+ // but we want to avoid doing so for pointer types. This could be a
+ // problem on systems where pointers are only 32bit. This test verifies
+ // that there is no such problem.
+ char *str = reinterpret_cast<char *>(0x80000000u);  // Bit 31 is set.
+ void *ptr = str;
+ char buf[40];
+ EXPECT_EQ(10, SafeSPrintf(buf, "%p", str));
+ EXPECT_EQ("0x80000000", std::string(buf));
+ EXPECT_EQ(10, SafeSPrintf(buf, "%p", ptr));
+ EXPECT_EQ("0x80000000", std::string(buf));
+}
+
+} // namespace strings
+} // namespace base
diff --git a/base/strings/strcat.cc b/base/strings/strcat.cc
new file mode 100644
index 0000000..1774a15
--- /dev/null
+++ b/base/strings/strcat.cc
@@ -0,0 +1,81 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/strcat.h"
+
+namespace gurl_base {
+
+namespace {
+
+// Reserves an additional amount of size in the given string, growing by at
+// least 2x. Used by StrAppend().
+//
+// The "at least 2x" growing rule duplicates the exponential growth of
+// std::string. The problem is that most implementations of reserve() will grow
+// exactly to the requested amount instead of exponentially growing like would
+// happen when appending normally. If we didn't do this, an append after the
+// call to StrAppend() would definitely cause a reallocation, and loops with
+// StrAppend() calls would have O(n^2) complexity to execute. Instead, we want
+// StrAppend() to have the same semantics as std::string::append().
+//
+// If the string is empty, we assume that exponential growth is not necessary.
+template <typename String>
+void ReserveAdditional(String* str, typename String::size_type additional) {
+  str->reserve(std::max(str->size() * 2, str->size() + additional));
+}
+
+template <typename DestString, typename InputString>
+void StrAppendT(DestString* dest, span<const InputString> pieces) {
+  // First pass: total the piece lengths so capacity is reserved in one call.
+  size_t total_piece_length = 0;
+  for (const InputString& piece : pieces) total_piece_length += piece.size();
+  ReserveAdditional(dest, total_piece_length);
+  // Second pass: copy each piece's characters onto the end of |dest|.
+  for (const InputString& piece : pieces)
+    dest->append(piece.data(), piece.size());
+}
+
+} // namespace
+
+std::string StrCat(span<const StringPiece> pieces) {
+  std::string joined;  // Starts out empty; StrAppendT() fills it in.
+  StrAppendT(&joined, pieces);
+  return joined;
+}
+
+string16 StrCat(span<const StringPiece16> pieces) {
+  string16 joined;  // Starts out empty; StrAppendT() fills it in.
+  StrAppendT(&joined, pieces);
+  return joined;
+}
+
+std::string StrCat(span<const std::string> pieces) {
+  std::string joined;  // Starts out empty; StrAppendT() fills it in.
+  StrAppendT(&joined, pieces);
+  return joined;
+}
+
+string16 StrCat(span<const string16> pieces) {
+  string16 joined;  // Starts out empty; StrAppendT() fills it in.
+  StrAppendT(&joined, pieces);
+  return joined;
+}
+
+void StrAppend(std::string* dest, span<const StringPiece> pieces) {
+ StrAppendT(dest, pieces);  // Forwards to the shared template helper.
+}
+
+void StrAppend(string16* dest, span<const StringPiece16> pieces) {
+ StrAppendT(dest, pieces);  // Forwards to the shared template helper.
+}
+
+void StrAppend(std::string* dest, span<const std::string> pieces) {
+ StrAppendT(dest, pieces);  // Forwards to the shared template helper.
+}
+
+void StrAppend(string16* dest, span<const string16> pieces) {
+ StrAppendT(dest, pieces);  // Forwards to the shared template helper.
+}
+
+} // namespace gurl_base
diff --git a/base/strings/strcat.h b/base/strings/strcat.h
new file mode 100644
index 0000000..bcdfe17
--- /dev/null
+++ b/base/strings/strcat.h
@@ -0,0 +1,100 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRCAT_H_
+#define BASE_STRINGS_STRCAT_H_
+
+#include <initializer_list>
+
+#include "polyfills/base/base_export.h"
+#include "base/compiler_specific.h"
+#include "base/containers/span.h"
+#include "base/strings/string_piece.h"
+#include "build/build_config.h"
+
+#if defined(OS_WIN)
+// Guard against conflict with Win32 API StrCat macro:
+// check StrCat wasn't and will not be redefined.
+#define StrCat StrCat
+#endif
+
+namespace gurl_base {
+
+// StrCat ----------------------------------------------------------------------
+//
+// StrCat is a function to perform concatenation on a sequence of strings.
+// It is preferable to a sequence of "a + b + c" because it is both faster and
+// generates less code.
+//
+// std::string result = gurl_base::StrCat({"foo ", result, "\nfoo ", bar});
+//
+// To join an array of strings with a separator, see gurl_base::JoinString in
+// base/strings/string_util.h.
+//
+// MORE INFO
+//
+// StrCat can see all arguments at once, so it can allocate one return buffer
+// of exactly the right size and copy once, as opposed to a sequence of
+// operator+ which generates a series of temporary strings, copying as it goes.
+// And by using StringPiece arguments, StrCat can avoid creating temporary
+// string objects for char* constants.
+//
+// ALTERNATIVES
+//
+// Internal Google / Abseil has a similar StrCat function. That version takes
+// an overloaded number of arguments instead of initializer list (overflowing
+// to initializer list for many arguments). We don't have any legacy
+// requirements and using only initializer_list is simpler and generates
+// roughly the same amount of code at the call sites.
+//
+// Abseil's StrCat also allows numbers by using an intermediate class that can
+// be implicitly constructed from either a string or various number types. This
+// class formats the numbers into a static buffer for increased performance,
+// and the call sites look nice.
+//
+// As-written Abseil's helper class for numbers generates slightly more code
+// than the raw StringPiece version. We can de-inline the helper class'
+// constructors which will cause the StringPiece constructors to be de-inlined
+// for this call and generate slightly less code. This is something we can
+// explore more in the future.
+
+BASE_EXPORT std::string StrCat(span<const StringPiece> pieces);
+BASE_EXPORT string16 StrCat(span<const StringPiece16> pieces);
+BASE_EXPORT std::string StrCat(span<const std::string> pieces);
+BASE_EXPORT string16 StrCat(span<const string16> pieces);
+
+// Initializer list forwards to the span version declared above.
+inline std::string StrCat(std::initializer_list<StringPiece> pieces) {
+ return StrCat(make_span(pieces.begin(), pieces.size()));
+}
+inline string16 StrCat(std::initializer_list<StringPiece16> pieces) {
+ return StrCat(make_span(pieces.begin(), pieces.size()));
+}
+
+// StrAppend -------------------------------------------------------------------
+//
+// Appends a sequence of strings to a destination. Prefer:
+// StrAppend(&foo, ...);
+// over:
+// foo += StrCat(...);
+// because it avoids a temporary string allocation and copy.
+
+BASE_EXPORT void StrAppend(std::string* dest, span<const StringPiece> pieces);
+BASE_EXPORT void StrAppend(string16* dest, span<const StringPiece16> pieces);
+BASE_EXPORT void StrAppend(std::string* dest, span<const std::string> pieces);
+BASE_EXPORT void StrAppend(string16* dest, span<const string16> pieces);
+
+// Initializer list forwards to the span version declared above.
+inline void StrAppend(std::string* dest,
+ std::initializer_list<StringPiece> pieces) {
+ return StrAppend(dest, make_span(pieces.begin(), pieces.size()));
+}
+inline void StrAppend(string16* dest,
+ std::initializer_list<StringPiece16> pieces) {
+ return StrAppend(dest, make_span(pieces.begin(), pieces.size()));
+}
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRCAT_H_
diff --git a/base/strings/strcat_unittest.cc b/base/strings/strcat_unittest.cc
new file mode 100644
index 0000000..d51b840
--- /dev/null
+++ b/base/strings/strcat_unittest.cc
@@ -0,0 +1,67 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/strcat.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+TEST(StrCat, 8Bit) {  // One to five narrow string-literal pieces.
+ EXPECT_EQ("", StrCat({""}));  // A single empty piece.
+ EXPECT_EQ("1", StrCat({"1"}));
+ EXPECT_EQ("122", StrCat({"1", "22"}));
+ EXPECT_EQ("122333", StrCat({"1", "22", "333"}));
+ EXPECT_EQ("1223334444", StrCat({"1", "22", "333", "4444"}));
+ EXPECT_EQ("122333444455555", StrCat({"1", "22", "333", "4444", "55555"}));
+}
+
+TEST(StrCat, 16Bit) {  // One to three string16 pieces.
+ string16 arg1 = ASCIIToUTF16("1");
+ string16 arg2 = ASCIIToUTF16("22");
+ string16 arg3 = ASCIIToUTF16("333");
+
+ EXPECT_EQ(ASCIIToUTF16(""), StrCat({string16()}));  // A single empty piece.
+ EXPECT_EQ(ASCIIToUTF16("1"), StrCat({arg1}));
+ EXPECT_EQ(ASCIIToUTF16("122"), StrCat({arg1, arg2}));
+ EXPECT_EQ(ASCIIToUTF16("122333"), StrCat({arg1, arg2, arg3}));
+}
+
+TEST(StrAppend, 8Bit) {  // Appends narrow pieces to a non-empty destination.
+ std::string result;
+
+ result = "foo";  // The destination is reset to "foo" before every case.
+ StrAppend(&result, {std::string()});  // Appending one empty piece.
+ EXPECT_EQ("foo", result);
+
+ result = "foo";
+ StrAppend(&result, {"1"});
+ EXPECT_EQ("foo1", result);
+
+ result = "foo";
+ StrAppend(&result, {"1", "22", "333"});
+ EXPECT_EQ("foo122333", result);
+}
+
+TEST(StrAppend, 16Bit) {  // Appends string16 pieces to a non-empty destination.
+ string16 arg1 = ASCIIToUTF16("1");
+ string16 arg2 = ASCIIToUTF16("22");
+ string16 arg3 = ASCIIToUTF16("333");
+
+ string16 result;
+
+ result = ASCIIToUTF16("foo");  // The destination is reset before every case.
+ StrAppend(&result, {string16()});  // Appending one empty piece.
+ EXPECT_EQ(ASCIIToUTF16("foo"), result);
+
+ result = ASCIIToUTF16("foo");
+ StrAppend(&result, {arg1});
+ EXPECT_EQ(ASCIIToUTF16("foo1"), result);
+
+ result = ASCIIToUTF16("foo");
+ StrAppend(&result, {arg1, arg2, arg3});
+ EXPECT_EQ(ASCIIToUTF16("foo122333"), result);
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string16.cc b/base/strings/string16.cc
new file mode 100644
index 0000000..6ac8b8b
--- /dev/null
+++ b/base/strings/string16.cc
@@ -0,0 +1,87 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string16.h"
+
+#if defined(WCHAR_T_IS_UTF16) && !defined(_AIX)
+
+#error This file should not be used on 2-byte wchar_t systems
+// If this winds up being needed on 2-byte wchar_t systems, either the
+// definitions below can be used, or the host system's wide character
+// functions like wmemcmp can be wrapped.
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+#include <ostream>
+
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+int c16memcmp(const char16* s1, const char16* s2, size_t n) {
+  // memcmp() must not be used here because it would change the semantics.
+  for (size_t index = 0; index < n; ++index) {
+    const char16 left = s1[index];
+    const char16 right = s2[index];
+    if (left == right)
+      continue;
+    // Compare rather than subtract, because char16 is an unsigned type.
+    return left < right ? -1 : 1;
+  }
+  return 0;
+}
+
+size_t c16len(const char16* s) {
+  // Count code units up to, but not including, the terminating zero.
+  size_t count = 0;
+  while (s[count] != 0)
+    ++count;
+  return count;
+}
+
+const char16* c16memchr(const char16* s, char16 c, size_t n) {
+  // Linear scan of the first |n| code units for the first one equal to |c|.
+  for (size_t offset = 0; offset < n; ++offset) {
+    const char16* candidate = s + offset;
+    if (*candidate == c)
+      return candidate;
+  }
+  return nullptr;  // No match within the scanned range.
+}
+
+char16* c16memmove(char16* s1, const char16* s2, size_t n) {
+  return static_cast<char16*>(memmove(s1, s2, sizeof(char16) * n));  // bytes
+}
+
+char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
+  return static_cast<char16*>(memcpy(s1, s2, sizeof(char16) * n));  // bytes
+}
+
+char16* c16memset(char16* s, char16 c, size_t n) {
+  // Store |c| into each of the first |n| code units; the start of the
+  // buffer is handed back to the caller.
+  for (size_t index = 0; index < n; ++index) {
+    s[index] = c;
+  }
+  return s;
+}
+
+namespace string16_internals {
+
+std::ostream& operator<<(std::ostream& out, const string16& str) {
+ return out << gurl_base::StringPiece16(str);  // Wrap |str|, then stream it.
+}
+
+void PrintTo(const string16& str, std::ostream* out) {
+ *out << str;  // Uses the operator<< overload defined directly above.
+}
+
+} // namespace string16_internals
+
+} // namespace gurl_base
+
+template class std::
+ basic_string<gurl_base::char16, gurl_base::string16_internals::string16_char_traits>;
+
+#endif // WCHAR_T_IS_UTF32
diff --git a/base/strings/string16.h b/base/strings/string16.h
new file mode 100644
index 0000000..f17a57f
--- /dev/null
+++ b/base/strings/string16.h
@@ -0,0 +1,229 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING16_H_
+#define BASE_STRINGS_STRING16_H_
+
+// WHAT:
+// A version of std::basic_string that provides 2-byte characters even when
+// wchar_t is not implemented as a 2-byte type. You can access this class as
+// string16. We also define char16, which string16 is based upon.
+//
+// WHY:
+// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
+// data. Plenty of existing code operates on strings encoded as UTF-16.
+//
+// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
+// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
+// at run time, because it calls some functions (like wcslen) that come from
+// the system's native C library -- which was built with a 4-byte wchar_t!
+// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
+// entirely improper on those systems where the encoding of wchar_t is defined
+// as UTF-32.
+//
+// Here, we define string16, which is similar to std::wstring but replaces all
+// libc functions with custom, 2-byte-char compatible routines. It is capable
+// of carrying UTF-16-encoded data.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <functional>
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "build/build_config.h"
+
+#if defined(WCHAR_T_IS_UTF16)
+
+// Define a macro for wrapping construction of char16 arrays and string16s from
+// a literal string. This indirection allows for an easier migration of
+// gurl_base::char16 to char16_t on platforms where WCHAR_T_IS_UTF16, as only a one
+// character change to the macro will be necessary.
+// This macro does not exist when WCHAR_T_IS_UTF32, as it is currently not
+// possible to create a char array from a literal in this case.
+// TODO(https://crbug.com/911896): Remove this macro once gurl_base::char16 is
+// char16_t on all platforms.
+#define STRING16_LITERAL(x) L##x
+
+namespace gurl_base {
+
+typedef wchar_t char16;
+typedef std::wstring string16;
+
+} // namespace gurl_base
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+#include <wchar.h> // for mbstate_t
+
+namespace gurl_base {
+
+typedef uint16_t char16;
+
+// char16 versions of the functions required by string16_char_traits; these
+// are based on the wide character functions of similar names ("w" or "wcs"
+// instead of "c16").
+BASE_EXPORT int c16memcmp(const char16* s1, const char16* s2, size_t n);
+BASE_EXPORT size_t c16len(const char16* s);
+BASE_EXPORT const char16* c16memchr(const char16* s, char16 c, size_t n);
+BASE_EXPORT char16* c16memmove(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memcpy(char16* s1, const char16* s2, size_t n);
+BASE_EXPORT char16* c16memset(char16* s, char16 c, size_t n);
+
+// This namespace contains the implementation of gurl_base::string16 along with
+// things that need to be found via argument-dependent lookup from a
+// gurl_base::string16.
+namespace string16_internals {
+
+struct string16_char_traits {  // Character traits used by the string16 typedef.
+ typedef char16 char_type;
+ typedef int int_type;
+
+ // int_type needs to be able to hold each possible value of char_type, and in
+ // addition, the distinct value of eof().
+ static_assert(sizeof(int_type) > sizeof(char_type),
+ "int must be larger than 16 bits wide");
+
+ typedef std::streamoff off_type;
+ typedef mbstate_t state_type;
+ typedef std::fpos<state_type> pos_type;
+
+ static void assign(char_type& c1, const char_type& c2) {
+ c1 = c2;
+ }
+
+ static bool eq(const char_type& c1, const char_type& c2) {
+ return c1 == c2;
+ }
+ static bool lt(const char_type& c1, const char_type& c2) {
+ return c1 < c2;
+ }
+
+ static int compare(const char_type* s1, const char_type* s2, size_t n) {
+ return c16memcmp(s1, s2, n);  // Forwards to the c16 helper.
+ }
+
+ static size_t length(const char_type* s) {
+ return c16len(s);  // Forwards to the c16 helper.
+ }
+
+ static const char_type* find(const char_type* s, size_t n,
+ const char_type& a) {
+ return c16memchr(s, a, n);  // Note the argument order: (s, a, n).
+ }
+
+ static char_type* move(char_type* s1, const char_type* s2, size_t n) {
+ return c16memmove(s1, s2, n);  // Forwards to the c16 helper.
+ }
+
+ static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
+ return c16memcpy(s1, s2, n);  // Forwards to the c16 helper.
+ }
+
+ static char_type* assign(char_type* s, size_t n, char_type a) {
+ return c16memset(s, a, n);  // Note the argument order: (s, a, n).
+ }
+
+ static int_type not_eof(const int_type& c) {
+ return eq_int_type(c, eof()) ? 0 : c;  // eof() maps to 0; others unchanged.
+ }
+
+ static char_type to_char_type(const int_type& c) {
+ return char_type(c);
+ }
+
+ static int_type to_int_type(const char_type& c) {
+ return int_type(c);
+ }
+
+ static bool eq_int_type(const int_type& c1, const int_type& c2) {
+ return c1 == c2;
+ }
+
+ static int_type eof() {
+ return static_cast<int_type>(EOF);  // The EOF macro, converted to int_type.
+ }
+};
+
+} // namespace string16_internals
+
+typedef std::basic_string<char16,
+ gurl_base::string16_internals::string16_char_traits>
+ string16;
+
+namespace string16_internals {
+
+BASE_EXPORT extern std::ostream& operator<<(std::ostream& out,
+ const string16& str);
+
+// This is required by googletest to print a readable output on test failures.
+BASE_EXPORT extern void PrintTo(const string16& str, std::ostream* out);
+
+} // namespace string16_internals
+
+} // namespace gurl_base
+
+// The string class will be explicitly instantiated only once, in string16.cc.
+//
+// std::basic_string<> in GNU libstdc++ contains a static data member,
+// _S_empty_rep_storage, to represent empty strings. When an operation such
+// as assignment or destruction is performed on a string, causing its existing
+// data member to be invalidated, it must not be freed if this static data
+// member is being used. Otherwise, it counts as an attempt to free static
+// (and not allocated) data, which is a memory error.
+//
+// Generally, due to C++ template magic, _S_empty_rep_storage will be marked
+// as a coalesced symbol, meaning that the linker will combine multiple
+// instances into a single one when generating output.
+//
+// If a string class is used by multiple shared libraries, a problem occurs.
+// Each library will get its own copy of _S_empty_rep_storage. When strings
+// are passed across a library boundary for alteration or destruction, memory
+// errors will result. GNU libstdc++ contains a configuration option,
+// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
+// disables the static data member optimization, but it's a good optimization
+// and non-STL code is generally at the mercy of the system's STL
+// configuration. Fully-dynamic strings are not the default for GNU
+// libstdc++ itself or for the libstdc++ installations on the systems we care
+// about, such as Mac OS X and relevant flavors of Linux.
+//
+// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
+//
+// To avoid problems, string classes need to be explicitly instantiated only
+// once, in exactly one library. All other string users see it via an "extern"
+// declaration. This is precisely how GNU libstdc++ handles
+// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
+//
+// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
+// in which the linker does not fully coalesce symbols when dead code
+// stripping is enabled. This bug causes the memory errors described above
+// to occur even when a std::basic_string<> does not cross shared library
+// boundaries, such as in statically-linked executables.
+//
+// TODO(mark): File this bug with Apple and update this note with a bug number.
+
+extern template class BASE_EXPORT
+ std::basic_string<gurl_base::char16,
+ gurl_base::string16_internals::string16_char_traits>;
+
+// Specialize std::hash for gurl_base::string16. Although the style guide forbids
+// this in general, it is necessary for consistency with WCHAR_T_IS_UTF16
+// platforms, where gurl_base::string16 is a type alias for std::wstring.
+namespace std {
+template <>
+struct hash<gurl_base::string16> {
+  std::size_t operator()(const gurl_base::string16& s) const {
+    // Multiply the running value by 131, then add the next code unit.
+    std::size_t digest = 0;
+    for (std::size_t i = 0; i < s.size(); ++i) digest = digest * 131 + s[i];
+    return digest;
+  }
+};
+}  // namespace std
+
+#endif // WCHAR_T_IS_UTF32
+
+#endif // BASE_STRINGS_STRING16_H_
diff --git a/base/strings/string16_unittest.cc b/base/strings/string16_unittest.cc
new file mode 100644
index 0000000..a9aecef
--- /dev/null
+++ b/base/strings/string16_unittest.cc
@@ -0,0 +1,75 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <sstream>
+#include <unordered_set>
+
+#include "base/strings/string16.h"
+
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+#if defined(WCHAR_T_IS_UTF16)
+TEST(String16Test, String16Literal) {
+ static constexpr char16 kHelloWorld[] = STRING16_LITERAL("Hello, World");
+ string16 hello_world = kHelloWorld;
+ EXPECT_EQ(kHelloWorld, hello_world);  // Array and string16 must compare equal.
+}
+#endif
+
+// We define a custom operator<< for string16 so we can use it with logging.
+// This tests that conversion.
+TEST(String16Test, OutputStream) {
+ // Basic stream test.
+ {
+ std::ostringstream stream;
+ stream << "Empty '" << string16() << "' standard '"
+ << string16(ASCIIToUTF16("Hello, world")) << "'";
+ EXPECT_STREQ("Empty '' standard 'Hello, world'",
+ stream.str().c_str());
+ }
+
+ // Interesting edge cases.
+ {
+ // These should each get converted to the invalid character: EF BF BD.
+ string16 initial_surrogate;
+ initial_surrogate.push_back(0xd800);
+ string16 final_surrogate;
+ final_surrogate.push_back(0xdc00);
+
+ // Old italic A = U+10300, will get converted to: F0 90 8C 80 'z'.
+ string16 surrogate_pair;
+ surrogate_pair.push_back(0xd800);
+ surrogate_pair.push_back(0xdf00);
+ surrogate_pair.push_back('z');
+
+ // Will get converted to the invalid char + 's': EF BF BD 's'.
+ string16 unterminated_surrogate;
+ unterminated_surrogate.push_back(0xd800);
+ unterminated_surrogate.push_back('s');
+
+ std::ostringstream stream;  // All four strings, separated by commas.
+ stream << initial_surrogate << "," << final_surrogate << ","
+ << surrogate_pair << "," << unterminated_surrogate;
+
+ EXPECT_STREQ("\xef\xbf\xbd,\xef\xbf\xbd,\xf0\x90\x8c\x80z,\xef\xbf\xbds",
+ stream.str().c_str());
+ }
+}
+
+TEST(String16Test, Hash) {  // string16 must be usable as an unordered_set key.
+ string16 str1 = ASCIIToUTF16("hello");
+ string16 str2 = ASCIIToUTF16("world");
+
+ std::unordered_set<string16> set;
+
+ set.insert(str1);  // Only str1 is inserted; str2 must not be found.
+ EXPECT_EQ(1u, set.count(str1));
+ EXPECT_EQ(0u, set.count(str2));
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_number_conversions.cc b/base/strings/string_number_conversions.cc
new file mode 100644
index 0000000..2bf6142
--- /dev/null
+++ b/base/strings/string_number_conversions.cc
@@ -0,0 +1,505 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_number_conversions.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <wctype.h>
+
+#include <limits>
+#include <type_traits>
+
+#include "polyfills/base/logging.h"
+#include "base/numerics/safe_math.h"
+#include "base/scoped_clear_last_error.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/third_party/dmg_fp/dmg_fp.h"
+
+namespace gurl_base {
+
+namespace {
+
+template <typename STR, typename INT>
+struct IntToStringT {
+ static STR IntToString(INT value) {
+ // A byte holds log10(2**8) ~= 2.4 decimal digits (~0.3 digits per bit), so
+ // round up to 3 output characters per byte, plus 1 for '-' when signed.
+ const size_t kOutputBufSize =
+ 3 * sizeof(INT) + std::numeric_limits<INT>::is_signed;
+
+ // Create the string in a temporary buffer, write it back to front, and
+ // then return the substr of what we ended up using.
+ using CHR = typename STR::value_type;
+ CHR outbuf[kOutputBufSize];
+
+ // The ValueOrDie call below can never fail, because UnsignedAbs is valid
+ // for all valid inputs.
+ typename std::make_unsigned<INT>::type res =
+ CheckedNumeric<INT>(value).UnsignedAbs().ValueOrDie();
+
+ CHR* end = outbuf + kOutputBufSize;
+ CHR* i = end;
+ do {  // do/while so that a value of 0 still emits one digit.
+ --i;
+ GURL_DCHECK(i != outbuf);
+ *i = static_cast<CHR>((res % 10) + '0');  // Least significant digit first.
+ res /= 10;
+ } while (res != 0);
+ if (IsValueNegative(value)) {
+ --i;
+ GURL_DCHECK(i != outbuf);
+ *i = static_cast<CHR>('-');
+ }
+ return STR(i, end);  // [i, end) is the part of |outbuf| that was written.
+ }
+};
+
+// Utility to convert a character to a digit in a given base (see CharToDigit).
+template<typename CHAR, int BASE, bool BASE_LTE_10> class BaseCharToDigit {
+};
+
+// Faster specialization for bases <= 10: only '0'..('0' + BASE - 1) are digits.
+template<typename CHAR, int BASE> class BaseCharToDigit<CHAR, BASE, true> {
+ public:
+  static bool Convert(CHAR c, uint8_t* digit) {
+    if (c >= '0' && c < '0' + BASE) {
+      *digit = static_cast<uint8_t>(c - '0');
+      return true;
+    }
+    return false;  // |*digit| is left untouched for a non-digit.
+  }
+};
+
+// Specialization for bases where 10 < base <= 36; letters are case-insensitive.
+template<typename CHAR, int BASE> class BaseCharToDigit<CHAR, BASE, false> {
+ public:
+  static bool Convert(CHAR c, uint8_t* digit) {
+    if (c >= '0' && c <= '9') {
+      *digit = static_cast<uint8_t>(c - '0');
+    } else if (c >= 'a' && c < 'a' + BASE - 10) {
+      *digit = static_cast<uint8_t>(c - 'a' + 10);
+    } else if (c >= 'A' && c < 'A' + BASE - 10) {
+      *digit = static_cast<uint8_t>(c - 'A' + 10);
+    } else {
+      return false;  // |*digit| is left untouched for a non-digit.
+    }
+    return true;
+  }
+};
+
+template <int BASE, typename CHAR>
+bool CharToDigit(CHAR c, uint8_t* digit) {  // Picks the specialization by BASE.
+  return BaseCharToDigit<CHAR, BASE, BASE <= 10>::Convert(c, digit);
+}
+
+// string_util.h has an IsUnicodeWhitespace for wchars, but that function is
+// locale independent while the functions replaced here were locale-dependent.
+// Which behaviour is actually wanted is TBD; until that is decided, keep the
+// locale-dependent behaviour so that nothing changes.
+template<typename CHAR> class WhitespaceHelper {
+};
+
+template<> class WhitespaceHelper<char> {
+ public:
+  static bool Invoke(char c) {
+    return isspace(static_cast<unsigned char>(c)) != 0;
+  }
+};
+
+template<> class WhitespaceHelper<char16> {
+ public:
+  static bool Invoke(char16 c) {
+    return iswspace(c) != 0;
+  }
+};
+
+template<typename CHAR> bool LocalIsWhitespace(CHAR c) {
+  return WhitespaceHelper<CHAR>::Invoke(c);
+}
+
+// IteratorRangeToNumberTraits should provide:
+// - a typedef for iterator_type, the iterator type used as input.
+// - a typedef for value_type, the target numeric type.
+// - static functions min, max (returning the minimum and maximum permitted
+// values)
+// - constant kBase, the base in which to interpret the input
+template<typename IteratorRangeToNumberTraits>
+class IteratorRangeToNumber {
+ public:
+ typedef IteratorRangeToNumberTraits traits;
+ typedef typename traits::iterator_type const_iterator;
+ typedef typename traits::value_type value_type;
+
+ // Generalized iterator-range-to-number conversion. Always writes |*output|;
+ // returns true only if the whole range [begin, end) parsed cleanly.
+ static bool Invoke(const_iterator begin,
+ const_iterator end,
+ value_type* output) {
+ bool valid = true;
+
+ while (begin != end && LocalIsWhitespace(*begin)) {
+ valid = false;  // Leading whitespace is skipped, but flagged as invalid.
+ ++begin;
+ }
+
+ if (begin != end && *begin == '-') {
+ if (!std::numeric_limits<value_type>::is_signed) {
+ *output = 0;  // A leading '-' is rejected outright for unsigned types.
+ valid = false;
+ } else if (!Negative::Invoke(begin + 1, end, output)) {
+ valid = false;
+ }
+ } else {
+ if (begin != end && *begin == '+') {
+ ++begin;  // An explicit '+' sign is accepted and skipped.
+ }
+ if (!Positive::Invoke(begin, end, output)) {
+ valid = false;
+ }
+ }
+
+ return valid;
+ }
+
+ private:
+ // Sign provides:
+ // - a static function, CheckBounds, that determines whether the next digit
+ // causes an overflow/underflow
+ // - a static function, Increment, that appends the next digit appropriately
+ // according to the sign of the number being parsed.
+ template<typename Sign>
+ class Base {
+ public:
+ static bool Invoke(const_iterator begin, const_iterator end,
+ typename traits::value_type* output) {
+ *output = 0;  // Result reported for empty or unparseable input.
+
+ if (begin == end) {
+ return false;
+ }
+
+ // Note: no performance difference was found when using template
+ // specialization to remove this check in bases other than 16
+ if (traits::kBase == 16 && end - begin > 2 && *begin == '0' &&
+ (*(begin + 1) == 'x' || *(begin + 1) == 'X')) {
+ begin += 2;  // Skip the optional 0x/0X prefix.
+ }
+
+ for (const_iterator current = begin; current != end; ++current) {
+ uint8_t new_digit = 0;
+
+ if (!CharToDigit<traits::kBase>(*current, &new_digit)) {
+ return false;  // |*output| keeps the value parsed so far.
+ }
+
+ if (current != begin) {
+ if (!Sign::CheckBounds(output, new_digit)) {
+ return false;  // CheckBounds has already saturated |*output|.
+ }
+ *output *= traits::kBase;  // Shift the previous digits up one place.
+ }
+
+ Sign::Increment(new_digit, output);
+ }
+ return true;
+ }
+ };
+
+ class Positive : public Base<Positive> {
+ public:
+ static bool CheckBounds(value_type* output, uint8_t new_digit) {
+ if (*output > static_cast<value_type>(traits::max() / traits::kBase) ||
+ (*output == static_cast<value_type>(traits::max() / traits::kBase) &&
+ new_digit > traits::max() % traits::kBase)) {
+ *output = traits::max();  // Saturate at the maximum on overflow.
+ return false;
+ }
+ return true;
+ }
+ static void Increment(uint8_t increment, value_type* output) {
+ *output += increment;
+ }
+ };
+
+ class Negative : public Base<Negative> {
+ public:
+ static bool CheckBounds(value_type* output, uint8_t new_digit) {
+ if (*output < traits::min() / traits::kBase ||
+ (*output == traits::min() / traits::kBase &&
+ new_digit > 0 - traits::min() % traits::kBase)) {
+ *output = traits::min();  // Saturate at the minimum on underflow.
+ return false;
+ }
+ return true;
+ }
+ static void Increment(uint8_t increment, value_type* output) {
+ *output -= increment;  // Negative values are accumulated downwards.
+ }
+ };
+};
+
+// Traits bundle for IteratorRangeToNumber: input iterator, result type, base.
+template <typename ITERATOR, typename VALUE, int BASE>
+class BaseIteratorRangeToNumberTraits {
+ public:
+  using iterator_type = ITERATOR;
+  using value_type = VALUE;
+  // std::numeric_limits minimum of |value_type|.
+  static value_type min() { return std::numeric_limits<value_type>::min(); }
+  // std::numeric_limits maximum of |value_type|.
+  static value_type max() { return std::numeric_limits<value_type>::max(); }
+  // Base in which the input is interpreted.
+  static const int kBase = BASE;
+};
+
+// Base-16 traits, one per supported result type.
+template <typename ITERATOR>
+struct BaseHexIteratorRangeToIntTraits
+    : BaseIteratorRangeToNumberTraits<ITERATOR, int, 16> {};
+
+template <typename ITERATOR>
+struct BaseHexIteratorRangeToUIntTraits
+    : BaseIteratorRangeToNumberTraits<ITERATOR, uint32_t, 16> {};
+
+template <typename ITERATOR>
+struct BaseHexIteratorRangeToInt64Traits
+    : BaseIteratorRangeToNumberTraits<ITERATOR, int64_t, 16> {};
+
+template <typename ITERATOR>
+struct BaseHexIteratorRangeToUInt64Traits
+    : BaseIteratorRangeToNumberTraits<ITERATOR, uint64_t, 16> {};
+
+using HexIteratorRangeToIntTraits =
+    BaseHexIteratorRangeToIntTraits<StringPiece::const_iterator>;
+
+using HexIteratorRangeToUIntTraits =
+    BaseHexIteratorRangeToUIntTraits<StringPiece::const_iterator>;
+
+using HexIteratorRangeToInt64Traits =
+    BaseHexIteratorRangeToInt64Traits<StringPiece::const_iterator>;
+
+using HexIteratorRangeToUInt64Traits =
+    BaseHexIteratorRangeToUInt64Traits<StringPiece::const_iterator>;
+
+// Traits for parsing a VALUE in base BASE out of a StringPiece.
+template <typename VALUE, int BASE>
+class StringPieceToNumberTraits
+    : public BaseIteratorRangeToNumberTraits<StringPiece::const_iterator,
+                                             VALUE,
+                                             BASE> {};
+
+template <typename VALUE>
+bool StringToIntImpl(StringPiece input, VALUE* output) {
+  using Parser = IteratorRangeToNumber<StringPieceToNumberTraits<VALUE, 10>>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+// Traits for parsing a VALUE in base BASE out of a StringPiece16.
+template <typename VALUE, int BASE>
+class StringPiece16ToNumberTraits
+    : public BaseIteratorRangeToNumberTraits<StringPiece16::const_iterator,
+                                             VALUE,
+                                             BASE> {};
+
+template <typename VALUE>
+bool String16ToIntImpl(StringPiece16 input, VALUE* output) {
+  using Parser = IteratorRangeToNumber<StringPiece16ToNumberTraits<VALUE, 10>>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+} // namespace
+
+std::string NumberToString(int value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(int value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(unsigned value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(unsigned value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(long value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(long value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(unsigned long value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(unsigned long value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(long long value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(long long value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(unsigned long long value) {
+  return IntToStringT<std::string, decltype(value)>::IntToString(value);
+}
+
+string16 NumberToString16(unsigned long long value) {
+  return IntToStringT<string16, decltype(value)>::IntToString(value);
+}
+
+std::string NumberToString(double value) {
+  // g_fmt.cc states that a buffer of size 32 is sufficient.
+  char formatted[32];
+  dmg_fp::g_fmt(formatted, value);
+  return std::string(formatted);
+}
+
+gurl_base::string16 NumberToString16(double value) {
+  // g_fmt.cc states that a buffer of size 32 is sufficient.
+  char formatted[32];
+  dmg_fp::g_fmt(formatted, value);
+
+  // The number is ASCII; the range constructor widens each char to 16 bits.
+  char* const formatted_end = formatted + strlen(formatted);
+  return gurl_base::string16(formatted, formatted_end);
+}
+
+bool StringToInt(StringPiece input, int* output) {
+  return StringToIntImpl<int>(input, output);
+}
+
+bool StringToInt(StringPiece16 input, int* output) {
+  return String16ToIntImpl<int>(input, output);
+}
+
+bool StringToUint(StringPiece input, unsigned* output) {
+  return StringToIntImpl<unsigned>(input, output);
+}
+
+bool StringToUint(StringPiece16 input, unsigned* output) {
+  return String16ToIntImpl<unsigned>(input, output);
+}
+
+bool StringToInt64(StringPiece input, int64_t* output) {
+  return StringToIntImpl<int64_t>(input, output);
+}
+
+bool StringToInt64(StringPiece16 input, int64_t* output) {
+  return String16ToIntImpl<int64_t>(input, output);
+}
+
+bool StringToUint64(StringPiece input, uint64_t* output) {
+  return StringToIntImpl<uint64_t>(input, output);
+}
+
+bool StringToUint64(StringPiece16 input, uint64_t* output) {
+  return String16ToIntImpl<uint64_t>(input, output);
+}
+
+bool StringToSizeT(StringPiece input, size_t* output) {
+  return StringToIntImpl<size_t>(input, output);
+}
+
+bool StringToSizeT(StringPiece16 input, size_t* output) {
+  return String16ToIntImpl<size_t>(input, output);
+}
+
+bool StringToDouble(const std::string& input, double* output) {
+  // Thread-safe?  It is on at least Mac, Linux, and Windows.
+  internal::ScopedClearLastError clear_errno;
+
+  char* endptr = nullptr;
+  *output = dmg_fp::strtod(input.c_str(), &endptr);
+
+  // Cases to return false:
+  //  - If errno is ERANGE, there was an overflow or underflow.
+  //  - If the input string is empty, there was nothing to parse.
+  //  - If endptr does not point to the end of the string, there are either
+  //    characters remaining in the string after a parsed number, or the string
+  //    does not begin with a parseable number.  endptr is compared to the
+  //    expected end given the string's stated length to correctly catch cases
+  //    where the string contains embedded NUL characters.
+  //  - If the first character is a space, there was leading whitespace
+  return errno == 0 &&
+         !input.empty() &&
+         input.c_str() + input.length() == endptr &&
+         !isspace(static_cast<unsigned char>(input[0]));  // Negative char: UB.
+}
+
+// Note: if you need to add String16ToDouble, first ask yourself if it's
+// really necessary. If it is, probably the best implementation here is to
+// convert to 8-bit and then use the 8-bit version.
+
+// Note: if you need to add an iterator range version of StringToDouble, first
+// ask yourself if it's really necessary. If it is, probably the best
+// implementation here is to instantiate a string and use the string version.
+
+std::string HexEncode(const void* bytes, size_t size) {
+  static const char kHexChars[] = "0123456789ABCDEF";
+
+  // Two hex characters are produced for every input byte.
+  const char* const input = reinterpret_cast<const char*>(bytes);
+  std::string encoded;
+  encoded.reserve(size * 2);
+  for (const char* p = input; p != input + size; ++p) {
+    encoded.push_back(kHexChars[(*p >> 4) & 0xf]);
+    encoded.push_back(kHexChars[*p & 0xf]);
+  }
+  return encoded;
+}
+
+std::string HexEncode(gurl_base::span<const uint8_t> bytes) {
+  return HexEncode(bytes.data(), bytes.size());
+}
+
+bool HexStringToInt(StringPiece input, int* output) {
+  using Parser = IteratorRangeToNumber<HexIteratorRangeToIntTraits>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+bool HexStringToUInt(StringPiece input, uint32_t* output) {
+  using Parser = IteratorRangeToNumber<HexIteratorRangeToUIntTraits>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+bool HexStringToInt64(StringPiece input, int64_t* output) {
+  using Parser = IteratorRangeToNumber<HexIteratorRangeToInt64Traits>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+bool HexStringToUInt64(StringPiece input, uint64_t* output) {
+  using Parser = IteratorRangeToNumber<HexIteratorRangeToUInt64Traits>;
+  return Parser::Invoke(input.begin(), input.end(), output);
+}
+
+bool HexStringToBytes(StringPiece input, std::vector<uint8_t>* output) {
+  GURL_DCHECK_EQ(output->size(), 0u);
+  const size_t length = input.size();
+  if (length == 0 || (length % 2) != 0)
+    return false;
+  for (size_t pos = 0; pos < length; pos += 2) {
+    uint8_t high_nibble = 0;  // most significant 4 bits
+    uint8_t low_nibble = 0;   // least significant 4 bits
+    if (!CharToDigit<16>(input[pos], &high_nibble) ||
+        !CharToDigit<16>(input[pos + 1], &low_nibble)) {
+      return false;
+    }
+    output->push_back((high_nibble << 4) | low_nibble);
+  }
+  return true;
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_number_conversions.h b/base/strings/string_number_conversions.h
new file mode 100644
index 0000000..a3acab8
--- /dev/null
+++ b/base/strings/string_number_conversions.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_
+#define BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "polyfills/base/base_export.h"
+#include "base/containers/span.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "build/build_config.h"
+
+// ----------------------------------------------------------------------------
+// IMPORTANT MESSAGE FROM YOUR SPONSOR
+//
+// This file contains no "wstring" variants. New code should use string16. If
+// you need to make old code work, use the UTF8 version and convert. Please do
+// not add wstring variants.
+//
+// Please do not add "convenience" functions for converting strings to integers
+// that return the value and ignore success/failure. That encourages people to
+// write code that doesn't properly handle the error conditions.
+//
+// DO NOT use these functions in any UI unless it's NOT localized on purpose.
+// Instead, use gurl_base::MessageFormatter for a complex message with numbers
+// (integer, float, double) embedded or gurl_base::Format{Number,Double,Percent} to
+// just format a single number/percent. Note that some languages use native
+// digits instead of ASCII digits while others use a group separator or decimal
+// point different from ',' and '.'. Using these functions in the UI would lead
+// numbers to be formatted in a non-native way.
+// ----------------------------------------------------------------------------
+
+namespace gurl_base {
+
+// Number -> string conversions ------------------------------------------------
+
+// Ignores locale! see warning above.
+BASE_EXPORT std::string NumberToString(int value);
+BASE_EXPORT string16 NumberToString16(int value);
+BASE_EXPORT std::string NumberToString(unsigned int value);
+BASE_EXPORT string16 NumberToString16(unsigned int value);
+BASE_EXPORT std::string NumberToString(long value);
+BASE_EXPORT string16 NumberToString16(long value);
+BASE_EXPORT std::string NumberToString(unsigned long value);
+BASE_EXPORT string16 NumberToString16(unsigned long value);
+BASE_EXPORT std::string NumberToString(long long value);
+BASE_EXPORT string16 NumberToString16(long long value);
+BASE_EXPORT std::string NumberToString(unsigned long long value);
+BASE_EXPORT string16 NumberToString16(unsigned long long value);
+BASE_EXPORT std::string NumberToString(double value);
+BASE_EXPORT string16 NumberToString16(double value);
+
+// String -> number conversions ------------------------------------------------
+
+// Perform a best-effort conversion of the input string to a numeric type,
+// setting |*output| to the result of the conversion. Returns true for
+// "perfect" conversions; returns false in the following cases:
+// - Overflow. |*output| will be set to the maximum value supported
+// by the data type.
+// - Underflow. |*output| will be set to the minimum value supported
+// by the data type.
+// - Trailing characters in the string after parsing the number. |*output|
+// will be set to the value of the number that was parsed.
+// - Leading whitespace in the string before parsing the number. |*output| will
+// be set to the value of the number that was parsed.
+// - No characters parseable as a number at the beginning of the string.
+// |*output| will be set to 0.
+// - Empty string. |*output| will be set to 0.
+// WARNING: Will write to |output| even when returning false.
+// Read the comments above carefully.
+BASE_EXPORT bool StringToInt(StringPiece input, int* output);
+BASE_EXPORT bool StringToInt(StringPiece16 input, int* output);
+
+BASE_EXPORT bool StringToUint(StringPiece input, unsigned* output);
+BASE_EXPORT bool StringToUint(StringPiece16 input, unsigned* output);
+
+BASE_EXPORT bool StringToInt64(StringPiece input, int64_t* output);
+BASE_EXPORT bool StringToInt64(StringPiece16 input, int64_t* output);
+
+BASE_EXPORT bool StringToUint64(StringPiece input, uint64_t* output);
+BASE_EXPORT bool StringToUint64(StringPiece16 input, uint64_t* output);
+
+BASE_EXPORT bool StringToSizeT(StringPiece input, size_t* output);
+BASE_EXPORT bool StringToSizeT(StringPiece16 input, size_t* output);
+
+// For floating-point conversions, only conversions of input strings in decimal
+// form are defined to work. Behavior with strings representing floating-point
+// numbers in hexadecimal, and strings representing non-finite values (such as
+// NaN and inf) is undefined. Otherwise, these behave the same as the integral
+// variants. This expects the input string to NOT be specific to the locale.
+// If your input is locale specific, use ICU to read the number.
+// WARNING: Will write to |output| even when returning false.
+// Read the comments here and above StringToInt() carefully.
+BASE_EXPORT bool StringToDouble(const std::string& input, double* output);
+
+// Hex encoding ----------------------------------------------------------------
+
+// Returns a hex string representation of a binary buffer. The returned hex
+// string will be in upper case. This function does not check if |size| is
+// within reasonable limits since it's written with trusted data in mind. If
+// you suspect that the data you want to format might be large, the absolute
+// max size for |size| is
+// std::numeric_limits<size_t>::max() / 2
+BASE_EXPORT std::string HexEncode(const void* bytes, size_t size);
+BASE_EXPORT std::string HexEncode(gurl_base::span<const uint8_t> bytes);
+
+// Best effort conversion, see StringToInt above for restrictions.
+// Will only successfully parse hex values that will fit into |output|, i.e.
+// -0x80000000 <= |input| <= 0x7FFFFFFF.
+BASE_EXPORT bool HexStringToInt(StringPiece input, int* output);
+
+// Best effort conversion, see StringToInt above for restrictions.
+// Will only successfully parse hex values that will fit into |output|, i.e.
+// 0x00000000 <= |input| <= 0xFFFFFFFF.
+// The string is not required to start with 0x.
+BASE_EXPORT bool HexStringToUInt(StringPiece input, uint32_t* output);
+
+// Best effort conversion, see StringToInt above for restrictions.
+// Will only successfully parse hex values that will fit into |output|, i.e.
+// -0x8000000000000000 <= |input| <= 0x7FFFFFFFFFFFFFFF.
+BASE_EXPORT bool HexStringToInt64(StringPiece input, int64_t* output);
+
+// Best effort conversion, see StringToInt above for restrictions.
+// Will only successfully parse hex values that will fit into |output|, i.e.
+// 0x0000000000000000 <= |input| <= 0xFFFFFFFFFFFFFFFF.
+// The string is not required to start with 0x.
+BASE_EXPORT bool HexStringToUInt64(StringPiece input, uint64_t* output);
+
+// Similar to the previous functions, except that output is a vector of bytes.
+// |*output| will contain as many bytes as were successfully parsed prior to the
+// error. There is no overflow, but input.size() must be evenly divisible by 2.
+// Leading 0x or +/- are not allowed.
+BASE_EXPORT bool HexStringToBytes(StringPiece input,
+ std::vector<uint8_t>* output);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_NUMBER_CONVERSIONS_H_
diff --git a/base/strings/string_number_conversions_fuzzer.cc b/base/strings/string_number_conversions_fuzzer.cc
new file mode 100644
index 0000000..012887a
--- /dev/null
+++ b/base/strings/string_number_conversions_fuzzer.cc
@@ -0,0 +1,118 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "base/strings/string_number_conversions.h"
+
+template <class NumberType, class StringPieceType, class StringType>
+void CheckRoundtripsT(const uint8_t* data,
+                      const size_t size,
+                      StringType (*num_to_string)(NumberType),
+                      bool (*string_to_num)(StringPieceType, NumberType*)) {
+  // Ensure we can read a NumberType from |data|.
+  if (size < sizeof(NumberType))
+    return;
+  // Copy byte-wise: |data| is a byte buffer with no alignment guarantee for
+  // NumberType, so dereferencing a casted pointer would be undefined behavior.
+  NumberType v1 = 0;
+  for (size_t i = 0; i < sizeof(NumberType); ++i)
+    reinterpret_cast<uint8_t*>(&v1)[i] = data[i];
+
+  // |v1| is an arbitrary number (not an arbitrary string), so converting it to
+  // a string and back must succeed and must reproduce the same value.
+  NumberType v2;
+  GURL_CHECK(string_to_num(num_to_string(v1), &v2));
+  GURL_CHECK_EQ(v1, v2);
+}
+
+template <class NumberType>
+void CheckRoundtrips(const uint8_t* data,
+                     const size_t size,
+                     bool (*string_to_num)(gurl_base::StringPiece, NumberType*)) {
+  std::string (*to_str)(NumberType) = &gurl_base::NumberToString;
+  CheckRoundtripsT(data, size, to_str, string_to_num);
+}
+
+template <class NumberType>
+void CheckRoundtrips16(const uint8_t* data,
+                       const size_t size,
+                       bool (*string_to_num)(gurl_base::StringPiece16,
+                                             NumberType*)) {
+  gurl_base::string16 (*to_str16)(NumberType) = &gurl_base::NumberToString16;
+  CheckRoundtripsT(data, size, to_str16, string_to_num);
+}
+
+// Entry point for LibFuzzer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // For each instantiation of NumberToString f and its corresponding StringTo*
+ // function g, check that g(f(x)) = x holds for fuzzer-determined values of x.
+ CheckRoundtrips<int>(data, size, &gurl_base::StringToInt);
+ CheckRoundtrips16<int>(data, size, &gurl_base::StringToInt);
+ CheckRoundtrips<unsigned int>(data, size, &gurl_base::StringToUint);
+ CheckRoundtrips16<unsigned int>(data, size, &gurl_base::StringToUint);
+ CheckRoundtrips<int64_t>(data, size, &gurl_base::StringToInt64);
+ CheckRoundtrips16<int64_t>(data, size, &gurl_base::StringToInt64);
+ CheckRoundtrips<uint64_t>(data, size, &gurl_base::StringToUint64);
+ CheckRoundtrips16<uint64_t>(data, size, &gurl_base::StringToUint64);
+ CheckRoundtrips<size_t>(data, size, &gurl_base::StringToSizeT);
+ CheckRoundtrips16<size_t>(data, size, &gurl_base::StringToSizeT);
+
+ gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data),
+ size);
+ std::string string_input(reinterpret_cast<const char*>(data), size);
+
+ int out_int;
+ gurl_base::StringToInt(string_piece_input, &out_int);
+ unsigned out_uint;
+ gurl_base::StringToUint(string_piece_input, &out_uint);
+ int64_t out_int64;
+ gurl_base::StringToInt64(string_piece_input, &out_int64);
+ uint64_t out_uint64;
+ gurl_base::StringToUint64(string_piece_input, &out_uint64);
+ size_t out_size;
+ gurl_base::StringToSizeT(string_piece_input, &out_size);
+
+ // Test for StringPiece16 if size is even.
+ if (size % 2 == 0) {
+ gurl_base::StringPiece16 string_piece_input16(
+ reinterpret_cast<const gurl_base::char16*>(data), size / 2);
+
+ gurl_base::StringToInt(string_piece_input16, &out_int);
+ gurl_base::StringToUint(string_piece_input16, &out_uint);
+ gurl_base::StringToInt64(string_piece_input16, &out_int64);
+ gurl_base::StringToUint64(string_piece_input16, &out_uint64);
+ gurl_base::StringToSizeT(string_piece_input16, &out_size);
+ }
+
+ double out_double;
+ gurl_base::StringToDouble(string_input, &out_double);
+
+ gurl_base::HexStringToInt(string_piece_input, &out_int);
+ gurl_base::HexStringToUInt(string_piece_input, &out_uint);
+ gurl_base::HexStringToInt64(string_piece_input, &out_int64);
+ gurl_base::HexStringToUInt64(string_piece_input, &out_uint64);
+ std::vector<uint8_t> out_bytes;
+ gurl_base::HexStringToBytes(string_piece_input, &out_bytes);
+
+ gurl_base::HexEncode(data, size);
+
+ // Convert the most recently parsed numbers back to strings.
+ gurl_base::NumberToString(out_int);
+ gurl_base::NumberToString16(out_int);
+ gurl_base::NumberToString(out_uint);
+ gurl_base::NumberToString16(out_uint);
+ gurl_base::NumberToString(out_int64);
+ gurl_base::NumberToString16(out_int64);
+ gurl_base::NumberToString(out_uint64);
+ gurl_base::NumberToString16(out_uint64);
+ gurl_base::NumberToString(out_double);
+ gurl_base::NumberToString16(out_double);
+
+ return 0;
+}
diff --git a/base/strings/string_number_conversions_unittest.cc b/base/strings/string_number_conversions_unittest.cc
new file mode 100644
index 0000000..93405e2
--- /dev/null
+++ b/base/strings/string_number_conversions_unittest.cc
@@ -0,0 +1,901 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_number_conversions.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <cmath>
+#include <limits>
+
+#include "base/bit_cast.h"
+#include "base/format_macros.h"
+#include "base/stl_util.h"
+#include "base/strings/stringprintf.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+namespace {
+
+template <typename INT>
+struct NumberToStringTest { // One row of the NumberToString test tables below.
+ INT num; // Value handed to NumberToString()/NumberToString16().
+ const char* sexpected; // Expected text when |num| is formatted as-is (signed).
+ const char* uexpected; // Expected text when |num| is first cast to unsigned.
+};
+
+} // namespace
+
+TEST(StringNumberConversionsTest, NumberToString) { // Checks NumberToString/NumberToString16 for int and int64_t, as-is and cast to unsigned.
+ static const NumberToStringTest<int> int_tests[] = {
+ {0, "0", "0"},
+ {-1, "-1", "4294967295"}, // Cast to unsigned, -1 is expected to print as 4294967295.
+ {std::numeric_limits<int>::max(), "2147483647", "2147483647"},
+ {std::numeric_limits<int>::min(), "-2147483648", "2147483648"},
+ };
+ static const NumberToStringTest<int64_t> int64_tests[] = {
+ {0, "0", "0"},
+ {-1, "-1", "18446744073709551615"}, // Cast to uint64_t, -1 is expected to print as 18446744073709551615.
+ {
+ std::numeric_limits<int64_t>::max(), "9223372036854775807",
+ "9223372036854775807",
+ },
+ {std::numeric_limits<int64_t>::min(), "-9223372036854775808",
+ "9223372036854775808"},
+ };
+
+ for (const auto& test : int_tests) { // int table: 8-bit and UTF-16 results, signed then unsigned.
+ EXPECT_EQ(NumberToString(test.num), test.sexpected);
+ EXPECT_EQ(NumberToString16(test.num), UTF8ToUTF16(test.sexpected));
+ EXPECT_EQ(NumberToString(static_cast<unsigned>(test.num)), test.uexpected);
+ EXPECT_EQ(NumberToString16(static_cast<unsigned>(test.num)),
+ UTF8ToUTF16(test.uexpected));
+ }
+ for (const auto& test : int64_tests) { // int64_t table: same four checks, using uint64_t for the unsigned view.
+ EXPECT_EQ(NumberToString(test.num), test.sexpected);
+ EXPECT_EQ(NumberToString16(test.num), UTF8ToUTF16(test.sexpected));
+ EXPECT_EQ(NumberToString(static_cast<uint64_t>(test.num)), test.uexpected);
+ EXPECT_EQ(NumberToString16(static_cast<uint64_t>(test.num)),
+ UTF8ToUTF16(test.uexpected));
+ }
+}
+
+TEST(StringNumberConversionsTest, Uint64ToString) {
+  static const struct {  // Pairs a uint64_t with the decimal text it must yield.
+    uint64_t value;
+    std::string expected;
+  } kCases[] = {
+      {0, "0"},
+      {42, "42"},
+      {INT_MAX, "2147483647"},
+      {std::numeric_limits<uint64_t>::max(), "18446744073709551615"},
+  };
+  for (const auto& test_case : kCases) {
+    EXPECT_EQ(test_case.expected, NumberToString(test_case.value));
+  }
+}
+
+TEST(StringNumberConversionsTest, SizeTToString) { // Checks NumberToString for size_t inputs, including the maximum value.
+ size_t size_t_max = std::numeric_limits<size_t>::max();
+ std::string size_t_max_string = StringPrintf("%" PRIuS, size_t_max); // Reference text for the maximum, built with StringPrintf.
+
+ static const struct {
+ size_t input;
+ std::string output; // Text NumberToString(input) is expected to return.
+ } cases[] = {
+ {0, "0"},
+ {9, "9"},
+ {42, "42"},
+ {INT_MAX, "2147483647"},
+ {2147483648U, "2147483648"},
+#if SIZE_MAX > 4294967295U
+ {99999999999U, "99999999999"}, // Only compiled when size_t can hold more than 4294967295.
+#endif
+ {size_t_max, size_t_max_string},
+ };
+
+ for (const auto& i : cases)
+ EXPECT_EQ(i.output, NumberToString(i.input));
+}
+
+TEST(StringNumberConversionsTest, StringToInt) {
+  static const struct {  // One row per input: text, expected value, result.
+    std::string input;
+    int output;    // Value StringToInt must store in |*output|.
+    bool success;  // Expected return value.
+  } cases[] = {
+      {"0", 0, true},
+      {"42", 42, true},
+      {"42\x99", 42, false},
+      {"\x99" "42\x99", 0, false},
+      {"-2147483648", INT_MIN, true},
+      {"2147483647", INT_MAX, true},
+      {"", 0, false},
+      {" 42", 42, false},
+      {"42 ", 42, false},
+      {"\t\n\v\f\r 42", 42, false},
+      {"blah42", 0, false},
+      {"42blah", 42, false},
+      {"blah42blah", 0, false},
+      {"-273.15", -273, false},
+      {"+98.6", 98, false},
+      {"--123", 0, false},
+      {"++123", 0, false},
+      {"-+123", 0, false},
+      {"+-123", 0, false},
+      {"-", 0, false},
+      {"-2147483649", INT_MIN, false},
+      {"-99999999999", INT_MIN, false},
+      {"2147483648", INT_MAX, false},
+      {"99999999999", INT_MAX, false},
+  };
+
+  for (const auto& i : cases) {
+    int output = i.output ^ 1;  // Ensure StringToInt wrote something.
+    EXPECT_EQ(i.success, StringToInt(i.input, &output));
+    EXPECT_EQ(i.output, output);
+
+    string16 utf16_input = UTF8ToUTF16(i.input);
+    output = i.output ^ 1;  // Ensure StringToInt wrote something.
+    EXPECT_EQ(i.success, StringToInt(utf16_input, &output));
+    EXPECT_EQ(i.output, output);
+  }
+
+  // Verify numbers in strings with embedded NUL characters: the NUL and the
+  // data after it must be treated as junk after the number. The literal is
+  // split after "\0" because "6\06" is '6' plus octal escape \06, not a NUL.
+  const char input[] = "6\0" "6";
+  std::string input_string(input, gurl_base::size(input) - 1);
+  int output = 0;
+  EXPECT_FALSE(StringToInt(input_string, &output));
+  EXPECT_EQ(6, output);
+
+  string16 utf16_input = UTF8ToUTF16(input_string);
+  output = 0;
+  EXPECT_FALSE(StringToInt(utf16_input, &output));
+  EXPECT_EQ(6, output);
+  // A leading non-ASCII UTF-16 code unit must make the conversion fail.
+  output = 0;
+  const char16 negative_wide_input[] = { 0xFF4D, '4', '2', 0};
+  EXPECT_FALSE(StringToInt(string16(negative_wide_input), &output));
+  EXPECT_EQ(0, output);
+}
+
+TEST(StringNumberConversionsTest, StringToUint) {
+  static const struct {  // One row per input: text, expected value, result.
+    std::string input;
+    unsigned output;  // Value StringToUint must store in |*output|.
+    bool success;     // Expected return value.
+  } cases[] = {
+      {"0", 0, true},
+      {"42", 42, true},
+      {"42\x99", 42, false},
+      {"\x99" "42\x99", 0, false},
+      {"-2147483648", 0, false},
+      {"2147483647", INT_MAX, true},
+      {"", 0, false},
+      {" 42", 42, false},
+      {"42 ", 42, false},
+      {"\t\n\v\f\r 42", 42, false},
+      {"blah42", 0, false},
+      {"42blah", 42, false},
+      {"blah42blah", 0, false},
+      {"-273.15", 0, false},
+      {"+98.6", 98, false},
+      {"--123", 0, false},
+      {"++123", 0, false},
+      {"-+123", 0, false},
+      {"+-123", 0, false},
+      {"-", 0, false},
+      {"-2147483649", 0, false},
+      {"-99999999999", 0, false},
+      {"4294967295", UINT_MAX, true},
+      {"4294967296", UINT_MAX, false},
+      {"99999999999", UINT_MAX, false},
+  };
+
+  for (const auto& i : cases) {
+    unsigned output = i.output ^ 1;  // Ensure StringToUint wrote something.
+    EXPECT_EQ(i.success, StringToUint(i.input, &output));
+    EXPECT_EQ(i.output, output);
+
+    string16 utf16_input = UTF8ToUTF16(i.input);
+    output = i.output ^ 1;  // Ensure StringToUint wrote something.
+    EXPECT_EQ(i.success, StringToUint(utf16_input, &output));
+    EXPECT_EQ(i.output, output);
+  }
+
+  // Verify numbers in strings with embedded NUL characters: the NUL and the
+  // data after it must be treated as junk after the number. The literal is
+  // split after "\0" because "6\06" is '6' plus octal escape \06, not a NUL.
+  const char input[] = "6\0" "6";
+  std::string input_string(input, gurl_base::size(input) - 1);
+  unsigned output = 0;
+  EXPECT_FALSE(StringToUint(input_string, &output));
+  EXPECT_EQ(6U, output);
+
+  string16 utf16_input = UTF8ToUTF16(input_string);
+  output = 0;
+  EXPECT_FALSE(StringToUint(utf16_input, &output));
+  EXPECT_EQ(6U, output);
+  // A leading non-ASCII UTF-16 code unit must make the conversion fail.
+  output = 0;
+  const char16 negative_wide_input[] = { 0xFF4D, '4', '2', 0};
+  EXPECT_FALSE(StringToUint(string16(negative_wide_input), &output));
+  EXPECT_EQ(0U, output);
+}
+
+TEST(StringNumberConversionsTest, StringToInt64) {
+  static const struct {  // One row per input: text, expected value, result.
+    std::string input;
+    int64_t output;  // Value StringToInt64 must store in |*output|.
+    bool success;    // Expected return value.
+  } cases[] = {
+      {"0", 0, true},
+      {"42", 42, true},
+      {"-2147483648", INT_MIN, true},
+      {"2147483647", INT_MAX, true},
+      {"-2147483649", INT64_C(-2147483649), true},
+      {"-99999999999", INT64_C(-99999999999), true},
+      {"2147483648", INT64_C(2147483648), true},
+      {"99999999999", INT64_C(99999999999), true},
+      {"9223372036854775807", std::numeric_limits<int64_t>::max(), true},
+      {"-9223372036854775808", std::numeric_limits<int64_t>::min(), true},
+      {"09", 9, true},
+      {"-09", -9, true},
+      {"", 0, false},
+      {" 42", 42, false},
+      {"42 ", 42, false},
+      {"0x42", 0, false},
+      {"\t\n\v\f\r 42", 42, false},
+      {"blah42", 0, false},
+      {"42blah", 42, false},
+      {"blah42blah", 0, false},
+      {"-273.15", -273, false},
+      {"+98.6", 98, false},
+      {"--123", 0, false},
+      {"++123", 0, false},
+      {"-+123", 0, false},
+      {"+-123", 0, false},
+      {"-", 0, false},
+      {"-9223372036854775809", std::numeric_limits<int64_t>::min(), false},
+      {"-99999999999999999999", std::numeric_limits<int64_t>::min(), false},
+      {"9223372036854775808", std::numeric_limits<int64_t>::max(), false},
+      {"99999999999999999999", std::numeric_limits<int64_t>::max(), false},
+  };
+
+  for (const auto& i : cases) {
+    int64_t output = i.output ^ 1;  // Ensure StringToInt64 wrote something.
+    EXPECT_EQ(i.success, StringToInt64(i.input, &output));
+    EXPECT_EQ(i.output, output);
+
+    string16 utf16_input = UTF8ToUTF16(i.input);
+    output = i.output ^ 1;  // Ensure StringToInt64 wrote something.
+    EXPECT_EQ(i.success, StringToInt64(utf16_input, &output));
+    EXPECT_EQ(i.output, output);
+  }
+
+  // Verify numbers in strings with embedded NUL characters: the NUL and the
+  // data after it must be treated as junk after the number. The literal is
+  // split after "\0" because "6\06" is '6' plus octal escape \06, not a NUL.
+  const char input[] = "6\0" "6";
+  std::string input_string(input, gurl_base::size(input) - 1);
+  int64_t output = 0;
+  EXPECT_FALSE(StringToInt64(input_string, &output));
+  EXPECT_EQ(6, output);
+
+  string16 utf16_input = UTF8ToUTF16(input_string);
+  output = 0;
+  EXPECT_FALSE(StringToInt64(utf16_input, &output));
+  EXPECT_EQ(6, output);
+}
+
+TEST(StringNumberConversionsTest, StringToUint64) {
+  static const struct {  // One row per input: text, expected value, result.
+    std::string input;
+    uint64_t output;  // Value StringToUint64 must store in |*output|.
+    bool success;     // Expected return value.
+  } cases[] = {
+      {"0", 0, true},
+      {"42", 42, true},
+      {"-2147483648", 0, false},
+      {"2147483647", INT_MAX, true},
+      {"-2147483649", 0, false},
+      {"-99999999999", 0, false},
+      {"2147483648", UINT64_C(2147483648), true},
+      {"99999999999", UINT64_C(99999999999), true},
+      {"9223372036854775807", std::numeric_limits<int64_t>::max(), true},
+      {"-9223372036854775808", 0, false},
+      {"09", 9, true},
+      {"-09", 0, false},
+      {"", 0, false},
+      {" 42", 42, false},
+      {"42 ", 42, false},
+      {"0x42", 0, false},
+      {"\t\n\v\f\r 42", 42, false},
+      {"blah42", 0, false},
+      {"42blah", 42, false},
+      {"blah42blah", 0, false},
+      {"-273.15", 0, false},
+      {"+98.6", 98, false},
+      {"--123", 0, false},
+      {"++123", 0, false},
+      {"-+123", 0, false},
+      {"+-123", 0, false},
+      {"-", 0, false},
+      {"-9223372036854775809", 0, false},
+      {"-99999999999999999999", 0, false},
+      {"9223372036854775808", UINT64_C(9223372036854775808), true},
+      {"99999999999999999999", std::numeric_limits<uint64_t>::max(), false},
+      {"18446744073709551615", std::numeric_limits<uint64_t>::max(), true},
+      {"18446744073709551616", std::numeric_limits<uint64_t>::max(), false},
+  };
+
+  for (const auto& i : cases) {
+    uint64_t output = i.output ^ 1;  // Ensure StringToUint64 wrote something.
+    EXPECT_EQ(i.success, StringToUint64(i.input, &output));
+    EXPECT_EQ(i.output, output);
+
+    string16 utf16_input = UTF8ToUTF16(i.input);
+    output = i.output ^ 1;  // Ensure StringToUint64 wrote something.
+    EXPECT_EQ(i.success, StringToUint64(utf16_input, &output));
+    EXPECT_EQ(i.output, output);
+  }
+
+  // Verify numbers in strings with embedded NUL characters: the NUL and the
+  // data after it must be treated as junk after the number. The literal is
+  // split after "\0" because "6\06" is '6' plus octal escape \06, not a NUL.
+  const char input[] = "6\0" "6";
+  std::string input_string(input, gurl_base::size(input) - 1);
+  uint64_t output = 0;
+  EXPECT_FALSE(StringToUint64(input_string, &output));
+  EXPECT_EQ(6U, output);
+
+  string16 utf16_input = UTF8ToUTF16(input_string);
+  output = 0;
+  EXPECT_FALSE(StringToUint64(utf16_input, &output));
+  EXPECT_EQ(6U, output);
+}
+
+TEST(StringNumberConversionsTest, StringToSizeT) {
+  size_t size_t_max = std::numeric_limits<size_t>::max();
+  std::string size_t_max_string = StringPrintf("%" PRIuS, size_t_max);
+
+  static const struct {  // One row per input: text, expected value, result.
+    std::string input;
+    size_t output;  // Value StringToSizeT must store in |*output|.
+    bool success;   // Expected return value.
+  } cases[] = {
+      {"0", 0, true},
+      {"42", 42, true},
+      {"-2147483648", 0, false},
+      {"2147483647", INT_MAX, true},
+      {"-2147483649", 0, false},
+      {"-99999999999", 0, false},
+      {"2147483648", 2147483648U, true},
+#if SIZE_MAX > 4294967295U
+      {"99999999999", 99999999999U, true},
+#endif
+      {"-9223372036854775808", 0, false},
+      {"09", 9, true},
+      {"-09", 0, false},
+      {"", 0, false},
+      {" 42", 42, false},
+      {"42 ", 42, false},
+      {"0x42", 0, false},
+      {"\t\n\v\f\r 42", 42, false},
+      {"blah42", 0, false},
+      {"42blah", 42, false},
+      {"blah42blah", 0, false},
+      {"-273.15", 0, false},
+      {"+98.6", 98, false},
+      {"--123", 0, false},
+      {"++123", 0, false},
+      {"-+123", 0, false},
+      {"+-123", 0, false},
+      {"-", 0, false},
+      {"-9223372036854775809", 0, false},
+      {"-99999999999999999999", 0, false},
+      {"999999999999999999999999", size_t_max, false},
+      {size_t_max_string, size_t_max, true},
+  };
+
+  for (const auto& i : cases) {
+    size_t output = i.output ^ 1;  // Ensure StringToSizeT wrote something.
+    EXPECT_EQ(i.success, StringToSizeT(i.input, &output));
+    EXPECT_EQ(i.output, output);
+
+    string16 utf16_input = UTF8ToUTF16(i.input);
+    output = i.output ^ 1;  // Ensure StringToSizeT wrote something.
+    EXPECT_EQ(i.success, StringToSizeT(utf16_input, &output));
+    EXPECT_EQ(i.output, output);
+  }
+
+  // Verify numbers in strings with embedded NUL characters: the NUL and the
+  // data after it must be treated as junk after the number. The literal is
+  // split after "\0" because "6\06" is '6' plus octal escape \06, not a NUL.
+  const char input[] = "6\0" "6";
+  std::string input_string(input, gurl_base::size(input) - 1);
+  size_t output = 0;
+  EXPECT_FALSE(StringToSizeT(input_string, &output));
+  EXPECT_EQ(6U, output);
+
+  string16 utf16_input = UTF8ToUTF16(input_string);
+  output = 0;
+  EXPECT_FALSE(StringToSizeT(utf16_input, &output));
+  EXPECT_EQ(6U, output);
+}
+
+TEST(StringNumberConversionsTest, HexStringToInt) { // Table-driven check of HexStringToInt's return value and stored result.
+ static const struct {
+ std::string input;
+ int64_t output; // Expected value; wider than the int that the loop below passes to HexStringToInt.
+ bool success; // Expected return value.
+ } cases[] = {
+ {"0", 0, true},
+ {"42", 66, true}, // Digits are read as hexadecimal: 0x42 == 66.
+ {"-42", -66, true},
+ {"+42", 66, true},
+ {"7fffffff", INT_MAX, true},
+ {"-80000000", INT_MIN, true},
+ {"80000000", INT_MAX, false}, // Overflow test.
+ {"-80000001", INT_MIN, false}, // Underflow test.
+ {"0x42", 66, true},
+ {"-0x42", -66, true},
+ {"+0x42", 66, true},
+ {"0x7fffffff", INT_MAX, true},
+ {"-0x80000000", INT_MIN, true},
+ {"-80000000", INT_MIN, true},
+ {"80000000", INT_MAX, false}, // Overflow test.
+ {"-80000001", INT_MIN, false}, // Underflow test.
+ {"0x0f", 15, true},
+ {"0f", 15, true},
+ {" 45", 0x45, false},
+ {"\t\n\v\f\r 0x45", 0x45, false},
+ {" 45", 0x45, false},
+ {"45 ", 0x45, false},
+ {"45:", 0x45, false},
+ {"efgh", 0xef, false},
+ {"0xefgh", 0xef, false},
+ {"hgfe", 0, false},
+ {"-", 0, false},
+ {"", 0, false},
+ {"0x", 0, false},
+ };
+
+ for (const auto& i : cases) {
+ int output = 0;
+ EXPECT_EQ(i.success, HexStringToInt(i.input, &output));
+ EXPECT_EQ(i.output, output);
+ }
+ // One additional test to verify the conversion of numbers in strings with
+ // embedded NUL characters. The NUL and extra data after it should be
+ // interpreted as junk after the number.
+ const char input[] = "0xc0ffee\0" "9"; // Literal is split so the '9' is not read as part of the \0 escape.
+ std::string input_string(input, gurl_base::size(input) - 1);
+ int output;
+ EXPECT_FALSE(HexStringToInt(input_string, &output));
+ EXPECT_EQ(0xc0ffee, output);
+}
+
+TEST(StringNumberConversionsTest, HexStringToUInt) { // Table-driven check of HexStringToUInt's return value and stored result.
+ static const struct {
+ std::string input;
+ uint32_t output; // Value expected in |*output| after the call.
+ bool success; // Expected return value.
+ } cases[] = {
+ {"0", 0, true},
+ {"42", 0x42, true},
+ {"-42", 0, false}, // Negative inputs are expected to fail with 0 stored.
+ {"+42", 0x42, true},
+ {"7fffffff", INT_MAX, true},
+ {"-80000000", 0, false},
+ {"ffffffff", 0xffffffff, true},
+ {"DeadBeef", 0xdeadbeef, true},
+ {"0x42", 0x42, true},
+ {"-0x42", 0, false},
+ {"+0x42", 0x42, true},
+ {"0x7fffffff", INT_MAX, true},
+ {"-0x80000000", 0, false},
+ {"0xffffffff", std::numeric_limits<uint32_t>::max(), true},
+ {"0XDeadBeef", 0xdeadbeef, true},
+ {"0x7fffffffffffffff", std::numeric_limits<uint32_t>::max(),
+ false}, // Overflow test.
+ {"-0x8000000000000000", 0, false},
+ {"0x8000000000000000", std::numeric_limits<uint32_t>::max(),
+ false}, // Overflow test.
+ {"-0x8000000000000001", 0, false},
+ {"0xFFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(),
+ false}, // Overflow test.
+ {"FFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(),
+ false}, // Overflow test.
+ {"0x0000000000000000", 0, true},
+ {"0000000000000000", 0, true},
+ {"1FFFFFFFFFFFFFFFF", std::numeric_limits<uint32_t>::max(),
+ false}, // Overflow test.
+ {"0x0f", 0x0f, true},
+ {"0f", 0x0f, true},
+ {" 45", 0x45, false},
+ {"\t\n\v\f\r 0x45", 0x45, false},
+ {" 45", 0x45, false},
+ {"45 ", 0x45, false},
+ {"45:", 0x45, false},
+ {"efgh", 0xef, false},
+ {"0xefgh", 0xef, false},
+ {"hgfe", 0, false},
+ {"-", 0, false},
+ {"", 0, false},
+ {"0x", 0, false},
+ };
+
+ for (const auto& i : cases) {
+ uint32_t output = 0;
+ EXPECT_EQ(i.success, HexStringToUInt(i.input, &output));
+ EXPECT_EQ(i.output, output);
+ }
+ // One additional test to verify the conversion of numbers in strings with
+ // embedded NUL characters. The NUL and extra data after it should be
+ // interpreted as junk after the number.
+ const char input[] = "0xc0ffee\0" "9"; // Literal is split so the '9' is not read as part of the \0 escape.
+ std::string input_string(input, gurl_base::size(input) - 1);
+ uint32_t output;
+ EXPECT_FALSE(HexStringToUInt(input_string, &output));
+ EXPECT_EQ(0xc0ffeeU, output);
+}
+
+TEST(StringNumberConversionsTest, HexStringToInt64) { // Table-driven check of HexStringToInt64's return value and stored result.
+ static const struct {
+ std::string input;
+ int64_t output; // Value expected in |*output| after the call.
+ bool success; // Expected return value.
+ } cases[] = {
+ {"0", 0, true},
+ {"42", 66, true}, // Digits are read as hexadecimal: 0x42 == 66.
+ {"-42", -66, true},
+ {"+42", 66, true},
+ {"40acd88557b", INT64_C(4444444448123), true},
+ {"7fffffff", INT_MAX, true},
+ {"-80000000", INT_MIN, true},
+ {"ffffffff", 0xffffffff, true},
+ {"DeadBeef", 0xdeadbeef, true},
+ {"0x42", 66, true},
+ {"-0x42", -66, true},
+ {"+0x42", 66, true},
+ {"0x40acd88557b", INT64_C(4444444448123), true},
+ {"0x7fffffff", INT_MAX, true},
+ {"-0x80000000", INT_MIN, true},
+ {"0xffffffff", 0xffffffff, true},
+ {"0XDeadBeef", 0xdeadbeef, true},
+ {"0x7fffffffffffffff", std::numeric_limits<int64_t>::max(), true},
+ {"-0x8000000000000000", std::numeric_limits<int64_t>::min(), true},
+ {"0x8000000000000000", std::numeric_limits<int64_t>::max(),
+ false}, // Overflow test.
+ {"-0x8000000000000001", std::numeric_limits<int64_t>::min(),
+ false}, // Underflow test.
+ {"0x0f", 15, true},
+ {"0f", 15, true},
+ {" 45", 0x45, false},
+ {"\t\n\v\f\r 0x45", 0x45, false},
+ {" 45", 0x45, false},
+ {"45 ", 0x45, false},
+ {"45:", 0x45, false},
+ {"efgh", 0xef, false},
+ {"0xefgh", 0xef, false},
+ {"hgfe", 0, false},
+ {"-", 0, false},
+ {"", 0, false},
+ {"0x", 0, false},
+ };
+
+ for (const auto& i : cases) {
+ int64_t output = 0;
+ EXPECT_EQ(i.success, HexStringToInt64(i.input, &output));
+ EXPECT_EQ(i.output, output);
+ }
+ // One additional test to verify the conversion of numbers in strings with
+ // embedded NUL characters. The NUL and extra data after it should be
+ // interpreted as junk after the number.
+ const char input[] = "0xc0ffee\0" "9"; // Literal is split so the '9' is not read as part of the \0 escape.
+ std::string input_string(input, gurl_base::size(input) - 1);
+ int64_t output;
+ EXPECT_FALSE(HexStringToInt64(input_string, &output));
+ EXPECT_EQ(0xc0ffee, output);
+}
+
+TEST(StringNumberConversionsTest, HexStringToUInt64) { // Table-driven check of HexStringToUInt64's return value and stored result.
+ static const struct {
+ std::string input;
+ uint64_t output; // Value expected in |*output| after the call.
+ bool success; // Expected return value.
+ } cases[] = {
+ {"0", 0, true},
+ {"42", 66, true}, // Digits are read as hexadecimal: 0x42 == 66.
+ {"-42", 0, false}, // Negative inputs are expected to fail with 0 stored.
+ {"+42", 66, true},
+ {"40acd88557b", INT64_C(4444444448123), true},
+ {"7fffffff", INT_MAX, true},
+ {"-80000000", 0, false},
+ {"ffffffff", 0xffffffff, true},
+ {"DeadBeef", 0xdeadbeef, true},
+ {"0x42", 66, true},
+ {"-0x42", 0, false},
+ {"+0x42", 66, true},
+ {"0x40acd88557b", INT64_C(4444444448123), true},
+ {"0x7fffffff", INT_MAX, true},
+ {"-0x80000000", 0, false},
+ {"0xffffffff", 0xffffffff, true},
+ {"0XDeadBeef", 0xdeadbeef, true},
+ {"0x7fffffffffffffff", std::numeric_limits<int64_t>::max(), true},
+ {"-0x8000000000000000", 0, false},
+ {"0x8000000000000000", UINT64_C(0x8000000000000000), true},
+ {"-0x8000000000000001", 0, false},
+ {"0xFFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(), true},
+ {"FFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(), true},
+ {"0x0000000000000000", 0, true},
+ {"0000000000000000", 0, true},
+ {"1FFFFFFFFFFFFFFFF", std::numeric_limits<uint64_t>::max(),
+ false}, // Overflow test.
+ {"0x0f", 15, true},
+ {"0f", 15, true},
+ {" 45", 0x45, false},
+ {"\t\n\v\f\r 0x45", 0x45, false},
+ {" 45", 0x45, false},
+ {"45 ", 0x45, false},
+ {"45:", 0x45, false},
+ {"efgh", 0xef, false},
+ {"0xefgh", 0xef, false},
+ {"hgfe", 0, false},
+ {"-", 0, false},
+ {"", 0, false},
+ {"0x", 0, false},
+ };
+
+ for (const auto& i : cases) {
+ uint64_t output = 0;
+ EXPECT_EQ(i.success, HexStringToUInt64(i.input, &output));
+ EXPECT_EQ(i.output, output);
+ }
+ // One additional test to verify the conversion of numbers in strings with
+ // embedded NUL characters. The NUL and extra data after it should be
+ // interpreted as junk after the number.
+ const char input[] = "0xc0ffee\0" "9"; // Literal is split so the '9' is not read as part of the \0 escape.
+ std::string input_string(input, gurl_base::size(input) - 1);
+ uint64_t output;
+ EXPECT_FALSE(HexStringToUInt64(input_string, &output));
+ EXPECT_EQ(0xc0ffeeU, output);
+}
+
+TEST(StringNumberConversionsTest, HexStringToBytes) { // Table-driven check of HexStringToBytes' return value and produced bytes.
+ static const struct {
+ const std::string input;
+ const char* output; // Expected bytes; may contain NULs, so the length is carried separately.
+ size_t output_len; // Number of bytes of |output| to compare.
+ bool success; // Expected return value.
+ } cases[] = {
+ {"0", "", 0, false}, // odd number of characters fails
+ {"00", "\0", 1, true},
+ {"42", "\x42", 1, true},
+ {"-42", "", 0, false}, // any non-hex value fails
+ {"+42", "", 0, false},
+ {"7fffffff", "\x7f\xff\xff\xff", 4, true},
+ {"80000000", "\x80\0\0\0", 4, true},
+ {"deadbeef", "\xde\xad\xbe\xef", 4, true},
+ {"DeadBeef", "\xde\xad\xbe\xef", 4, true},
+ {"0x42", "", 0, false}, // leading 0x fails (x is not hex)
+ {"0f", "\xf", 1, true},
+ {"45 ", "\x45", 1, false}, // a failing row may still expect bytes decoded before the bad character
+ {"efgh", "\xef", 1, false},
+ {"", "", 0, false},
+ {"0123456789ABCDEF", "\x01\x23\x45\x67\x89\xAB\xCD\xEF", 8, true},
+ {"0123456789ABCDEF012345",
+ "\x01\x23\x45\x67\x89\xAB\xCD\xEF\x01\x23\x45", 11, true},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(cases); ++i) { // Index loop: |i| is also printed in failure messages.
+ std::vector<uint8_t> output;
+ std::vector<uint8_t> compare;
+ EXPECT_EQ(cases[i].success, HexStringToBytes(cases[i].input, &output)) <<
+ i << ": " << cases[i].input;
+ for (size_t j = 0; j < cases[i].output_len; ++j) // Build the expected byte vector from the C string.
+ compare.push_back(static_cast<uint8_t>(cases[i].output[j]));
+ ASSERT_EQ(output.size(), compare.size()) << i << ": " << cases[i].input; // Sizes must match before std::equal reads |compare|.
+ EXPECT_TRUE(std::equal(output.begin(), output.end(), compare.begin())) <<
+ i << ": " << cases[i].input;
+ }
+}
+
+TEST(StringNumberConversionsTest, StringToDouble) { // Table-driven check of StringToDouble's return value, stored result and errno handling.
+ static const struct {
+ std::string input;
+ double output; // Value expected in |*output|, compared with EXPECT_DOUBLE_EQ.
+ bool success; // Expected return value.
+ } cases[] = {
+ // Test different forms of zero.
+ {"0", 0.0, true},
+ {"+0", 0.0, true},
+ {"-0", 0.0, true},
+ {"0.0", 0.0, true},
+ {"000000000000000000000000000000.0", 0.0, true},
+ {"0.000000000000000000000000000", 0.0, true},
+
+ // Test the answer.
+ {"42", 42.0, true},
+ {"-42", -42.0, true},
+
+ // Test variances of an ordinary number.
+ {"123.45", 123.45, true},
+ {"-123.45", -123.45, true},
+ {"+123.45", 123.45, true},
+
+ // Test different forms of representation.
+ {"2.99792458e8", 299792458.0, true},
+ {"149597870.691E+3", 149597870691.0, true},
+ {"6.", 6.0, true},
+
+ // Test around the largest/smallest value that a double can represent.
+ {"9e307", 9e307, true},
+ {"1.7976e308", 1.7976e308, true},
+ {"1.7977e308", HUGE_VAL, false},
+ {"1.797693134862315807e+308", HUGE_VAL, true}, // NOTE(review): success row expecting HUGE_VAL; presumably passes because EXPECT_DOUBLE_EQ tolerates a few ULPs - confirm intent.
+ {"1.797693134862315808e+308", HUGE_VAL, false},
+ {"9e308", HUGE_VAL, false},
+ {"9e309", HUGE_VAL, false},
+ {"9e999", HUGE_VAL, false},
+ {"9e1999", HUGE_VAL, false},
+ {"9e19999", HUGE_VAL, false},
+ {"9e99999999999999999999", HUGE_VAL, false},
+ {"-9e307", -9e307, true},
+ {"-1.7976e308", -1.7976e308, true},
+ {"-1.7977e308", -HUGE_VAL, false},
+ {"-1.797693134862315807e+308", -HUGE_VAL, true},
+ {"-1.797693134862315808e+308", -HUGE_VAL, false},
+ {"-9e308", -HUGE_VAL, false},
+ {"-9e309", -HUGE_VAL, false},
+ {"-9e999", -HUGE_VAL, false},
+ {"-9e1999", -HUGE_VAL, false},
+ {"-9e19999", -HUGE_VAL, false},
+ {"-9e99999999999999999999", -HUGE_VAL, false},
+
+ // Test more exponents.
+ {"1e-2", 0.01, true},
+ {"42 ", 42.0, false},
+ {" 1e-2", 0.01, false},
+ {"1e-2 ", 0.01, false},
+ {"-1E-7", -0.0000001, true},
+ {"01e02", 100, true},
+ {"2.3e15", 2.3e15, true},
+ {"100e-309", 100e-309, true},
+
+ // Test some invalid cases.
+ {"\t\n\v\f\r -123.45e2", -12345.0, false},
+ {"+123 e4", 123.0, false},
+ {"123e ", 123.0, false},
+ {"123e", 123.0, false},
+ {" 2.99", 2.99, false},
+ {"1e3.4", 1000.0, false},
+ {"nothing", 0.0, false},
+ {"-", 0.0, false},
+ {"+", 0.0, false},
+ {"", 0.0, false},
+
+ // crbug.org/588726
+ {"-0.0010000000000000000000000000000000000000001e-256",
+ -1.0000000000000001e-259, true},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(cases); ++i) {
+ double output;
+ errno = 1; // Sentinel value so the check below can tell whether the call touched errno.
+ EXPECT_EQ(cases[i].success, StringToDouble(cases[i].input, &output));
+ if (cases[i].success)
+ EXPECT_EQ(1, errno) << i; // confirm that errno is unchanged.
+ EXPECT_DOUBLE_EQ(cases[i].output, output);
+ }
+
+ // One additional test to verify the conversion of numbers in strings with
+ // embedded NUL characters. The NUL and extra data after it should be
+ // interpreted as junk after the number.
+ const char input[] = "3.14\0" "159"; // Literal is split so "159" is not read as part of the \0 escape.
+ std::string input_string(input, gurl_base::size(input) - 1);
+ double output;
+ EXPECT_FALSE(StringToDouble(input_string, &output));
+ EXPECT_DOUBLE_EQ(3.14, output);
+}
+
+TEST(StringNumberConversionsTest, DoubleToString) { // Checks the text NumberToString/NumberToString16 produce for doubles.
+ static const struct {
+ double input;
+ const char* expected; // Text expected from both the 8-bit and the UTF-16 formatter.
+ } cases[] = {
+ {0.0, "0"},
+ {1.25, "1.25"},
+ {1.33518e+012, "1.33518e+12"},
+ {1.33489e+012, "1.33489e+12"},
+ {1.33505e+012, "1.33505e+12"},
+ {1.33545e+009, "1335450000"},
+ {1.33503e+009, "1335030000"},
+ };
+
+ for (const auto& i : cases) {
+ EXPECT_EQ(i.expected, NumberToString(i.input));
+ EXPECT_EQ(i.expected, UTF16ToUTF8(NumberToString16(i.input))); // UTF-16 result is converted back to UTF-8 for comparison.
+ }
+
+ // The following two values were seen in crashes in the wild.
+ const char input_bytes[8] = {0, 0, 0, 0, '\xee', '\x6d', '\x73', '\x42'}; // NOTE(review): presumably a little-endian double image - confirm.
+ double input = 0;
+ memcpy(&input, input_bytes, gurl_base::size(input_bytes)); // Copy the raw bytes over the double's storage.
+ EXPECT_EQ("1335179083776", NumberToString(input));
+ const char input_bytes2[8] =
+ {0, 0, 0, '\xa0', '\xda', '\x6c', '\x73', '\x42'};
+ input = 0;
+ memcpy(&input, input_bytes2, gurl_base::size(input_bytes2));
+ EXPECT_EQ("1334890332160", NumberToString(input));
+}
+
+TEST(StringNumberConversionsTest, HexEncode) {
+  std::string encoded = HexEncode(nullptr, 0);  // No input bytes at all.
+  EXPECT_EQ(encoded.length(), 0U);
+  unsigned char sample[] = {0x01, 0xff, 0x02, 0xfe, 0x03, 0x80, 0x81};
+  encoded = HexEncode(sample, sizeof(sample));
+  EXPECT_EQ(encoded.compare("01FF02FE038081"), 0);  // Two upper-case digits per byte.
+}
+
+// Test cases of known-bad strtod conversions that motivated the use of dmg_fp.
+// See https://bugs.chromium.org/p/chromium/issues/detail?id=593512.
+TEST(StringNumberConversionsTest, StrtodFailures) {
+ static const struct {
+ const char* input;
+ uint64_t expected; // Exact bit pattern the parsed double must have (compared via bit_cast).
+ } cases[] = {
+ // http://www.exploringbinary.com/incorrectly-rounded-conversions-in-visual-c-plus-plus/
+ {"9214843084008499", 0x43405e6cec57761aULL},
+ {"0.500000000000000166533453693773481063544750213623046875",
+ 0x3fe0000000000002ULL},
+ {"30078505129381147446200", 0x44997a3c7271b021ULL},
+ {"1777820000000000000001", 0x4458180d5bad2e3eULL},
+ {"0.500000000000000166547006220929549868969843373633921146392822265625",
+ 0x3fe0000000000002ULL},
+ {"0.50000000000000016656055874808561867439493653364479541778564453125",
+ 0x3fe0000000000002ULL},
+ {"0.3932922657273", 0x3fd92bb352c4623aULL},
+
+ // http://www.exploringbinary.com/incorrectly-rounded-conversions-in-gcc-and-glibc/
+ {"0.500000000000000166533453693773481063544750213623046875",
+ 0x3fe0000000000002ULL},
+ {"3.518437208883201171875e13", 0x42c0000000000002ULL},
+ {"62.5364939768271845828", 0x404f44abd5aa7ca4ULL},
+ {"8.10109172351e-10", 0x3e0bd5cbaef0fd0cULL},
+ {"1.50000000000000011102230246251565404236316680908203125",
+ 0x3ff8000000000000ULL},
+ {"9007199254740991.4999999999999999999999999999999995",
+ 0x433fffffffffffffULL},
+
+ // http://www.exploringbinary.com/incorrect-decimal-to-floating-point-conversion-in-sqlite/
+ {"1e-23", 0x3b282db34012b251ULL},
+ {"8.533e+68", 0x4e3fa69165a8eea2ULL},
+ {"4.1006e-184", 0x19dbe0d1c7ea60c9ULL},
+ {"9.998e+307", 0x7fe1cc0a350ca87bULL},
+ {"9.9538452227e-280", 0x0602117ae45cde43ULL},
+ {"6.47660115e-260", 0x0a1fdd9e333badadULL},
+ {"7.4e+47", 0x49e033d7eca0adefULL},
+ {"5.92e+48", 0x4a1033d7eca0adefULL},
+ {"7.35e+66", 0x4dd172b70eababa9ULL},
+ {"8.32116e+55", 0x4b8b2628393e02cdULL},
+ };
+
+ for (const auto& test : cases) {
+ double output;
+ EXPECT_TRUE(StringToDouble(test.input, &output)); // Every row must parse successfully.
+ EXPECT_EQ(bit_cast<uint64_t>(output), test.expected); // Bitwise comparison, so no rounding slack is allowed.
+ }
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_piece.cc b/base/strings/string_piece.cc
new file mode 100644
index 0000000..68f3efc
--- /dev/null
+++ b/base/strings/string_piece.cc
@@ -0,0 +1,450 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Copied from strings/stringpiece.cc with modifications
+
+#include "base/strings/string_piece.h"
+
+#include <limits.h>
+
+#include <algorithm>
+#include <ostream>
+
+#include "polyfills/base/logging.h"
+#include "base/strings/utf_string_conversions.h"
+
+namespace gurl_base {
+namespace {
+
+// Marks, for every character in |characters_wanted|, the slot of |table|
+// indexed by that character's value (taken as an unsigned char) as true.
+// The find_.*_of functions below use the filled table to test in constant
+// time whether a character belongs to the wanted set.
+//
+// |table| must have room for every possible unsigned char value, so callers
+// should declare it as:
+//   bool table[UCHAR_MAX + 1]
+// Slots for characters that are not wanted are left untouched.
+inline void BuildLookupTable(const StringPiece& characters_wanted,
+                             bool* table) {
+  const char* cursor = characters_wanted.data();
+  const char* const stop = cursor + characters_wanted.length();
+  for (; cursor != stop; ++cursor)
+    table[static_cast<unsigned char>(*cursor)] = true;
+}
+
+} // namespace
+
+// MSVC doesn't like complex extern templates and DLLs.
+#if !defined(COMPILER_MSVC)
+template class BasicStringPiece<std::string>;
+template class BasicStringPiece<string16>;
+#endif
+
+std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
+  const auto byte_count = static_cast<std::streamsize>(piece.size());
+  return o.write(piece.data(), byte_count);  // write() returns |o| itself.
+}
+
+std::ostream& operator<<(std::ostream& o, const StringPiece16& piece) {
+ return o << UTF16ToUTF8(piece);  // Convert first, then stream the result.
+}
+
+namespace internal {
+
+// Replaces the contents of |*target| with the characters viewed by |self|.
+template <typename STR>
+void CopyToStringT(const BasicStringPiece<STR>& self, STR* target) {
+  const bool has_chars = !self.empty();
+  if (has_chars) target->assign(self.data(), self.size());
+  else target->clear();  // Never hand a possibly-null data() to assign().
+}
+
+void CopyToString(const StringPiece& self, std::string* target) {
+  CopyToStringT<std::string>(self, target);  // 8-bit instantiation.
+}
+
+void CopyToString(const StringPiece16& self, string16* target) {
+  CopyToStringT<string16>(self, target);  // 16-bit instantiation.
+}
+
+template <typename STR>  // Appends the viewed characters to |*target|.
+void AppendToStringT(const BasicStringPiece<STR>& self, STR* target) {
+  if (self.empty()) return;  // Nothing to add; data() may be null here.
+  target->append(self.data(), self.size());
+}
+
+void AppendToString(const StringPiece& self, std::string* target) {
+  AppendToStringT<std::string>(self, target);  // 8-bit instantiation.
+}
+
+void AppendToString(const StringPiece16& self, string16* target) {
+  AppendToStringT<string16>(self, target);  // 16-bit instantiation.
+}
+
+// Copies up to |n| chars from |pos| into |buf|; bad |pos| copies nothing.
+template <typename STR>
+size_t copyT(const BasicStringPiece<STR>& self,
+             typename STR::value_type* buf, size_t n, size_t pos) {
+  if (pos > self.size()) return 0;  // Else size() - pos would wrap around.
+  const size_t count = std::min(self.size() - pos, n);
+  std::copy_n(self.data() + pos, count, buf);  // Safe even when count == 0.
+  return count;
+}
+
+size_t copy(const StringPiece& self, char* buf, size_t n, size_t pos) {
+  return copyT<std::string>(self, buf, n, pos);  // 8-bit instantiation.
+}
+
+size_t copy(const StringPiece16& self, char16* buf, size_t n, size_t pos) {
+  return copyT<string16>(self, buf, n, pos);  // 16-bit instantiation.
+}
+
+// Index of the first occurrence of |s| in |self| at/after |pos|, else npos.
+template <typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+             const BasicStringPiece<STR>& s, size_t pos) {
+  using Piece = BasicStringPiece<STR>;
+  if (pos > self.size())
+    return Piece::npos;
+  const auto match = std::search(self.begin() + pos, self.end(),
+                                 s.begin(), s.end());
+  const size_t match_index = static_cast<size_t>(match - self.begin());
+  // std::search() yields end() on failure, which fails this bounds test.
+  return match_index + s.size() > self.size() ? Piece::npos : match_index;
+}
+
+size_t find(const StringPiece& self, const StringPiece& s, size_t pos) {
+  return findT<std::string>(self, s, pos);  // 8-bit substring search.
+}
+
+size_t find(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+  return findT<string16>(self, s, pos);  // 16-bit substring search.
+}
+
+// Index of the first |c| in |self| at or after |pos|, or npos if absent.
+template <typename STR>
+size_t findT(const BasicStringPiece<STR>& self,
+             typename STR::value_type c, size_t pos) {
+  if (pos >= self.size())
+    return BasicStringPiece<STR>::npos;
+  const auto last = self.end();
+  const auto hit = std::find(self.begin() + pos, last, c);
+  if (hit == last)
+    return BasicStringPiece<STR>::npos;
+  return static_cast<size_t>(hit - self.begin());
+}
+
+size_t find(const StringPiece& self, char c, size_t pos) {
+  return findT<std::string>(self, c, pos);  // 8-bit character search.
+}
+
+size_t find(const StringPiece16& self, char16 c, size_t pos) {
+  return findT<string16>(self, c, pos);  // 16-bit character search.
+}
+
+// Index of the last occurrence of |s| starting at or before |pos|, else npos.
+template <typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+              const BasicStringPiece<STR>& s, size_t pos) {
+  const size_t haystack_len = self.size();
+  const size_t needle_len = s.size();
+  if (haystack_len < needle_len)
+    return BasicStringPiece<STR>::npos;
+  if (needle_len == 0)  // The empty string matches at the clamped |pos|.
+    return std::min(haystack_len, pos);
+  // Only matches beginning at or before |pos| may be reported.
+  const auto limit =
+      self.begin() + std::min(haystack_len - needle_len, pos) + needle_len;
+  const auto hit = std::find_end(self.begin(), limit, s.begin(), s.end());
+  return hit == limit ? BasicStringPiece<STR>::npos
+                      : static_cast<size_t>(hit - self.begin());
+}
+
+size_t rfind(const StringPiece& self, const StringPiece& s, size_t pos) {
+  return rfindT<std::string>(self, s, pos);  // 8-bit substring search.
+}
+
+size_t rfind(const StringPiece16& self, const StringPiece16& s, size_t pos) {
+  return rfindT<string16>(self, s, pos);  // 16-bit substring search.
+}
+
+// Index of the last |c| in |self| at or before |pos|, or npos if absent.
+template <typename STR>
+size_t rfindT(const BasicStringPiece<STR>& self,
+              typename STR::value_type c, size_t pos) {
+  if (self.empty())
+    return BasicStringPiece<STR>::npos;
+  // |remaining| counts the candidates left, so index |remaining - 1| is
+  // examined next; counting this way avoids wrapping an unsigned index.
+  const typename STR::value_type* const chars = self.data();
+  for (size_t remaining = std::min(pos, self.size() - 1) + 1; remaining > 0;
+       --remaining) {
+    if (chars[remaining - 1] == c)
+      return remaining - 1;
+  }
+  return BasicStringPiece<STR>::npos;
+}
+
+size_t rfind(const StringPiece& self, char c, size_t pos) {
+  return rfindT<std::string>(self, c, pos);  // 8-bit character search.
+}
+
+size_t rfind(const StringPiece16& self, char16 c, size_t pos) {
+  return rfindT<string16>(self, c, pos);  // 16-bit character search.
+}
+
+// 8-bit version: membership in |s| is tested through a lookup table.
+size_t find_first_of(const StringPiece& self,
+                     const StringPiece& s,
+                     size_t pos) {
+  if (self.empty() || s.empty())
+    return StringPiece::npos;
+
+  // A single wanted character does not justify building the table.
+  if (s.size() == 1)
+    return find(self, s.data()[0], pos);
+
+  bool wanted[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, wanted);
+  const char* const chars = self.data();
+  const size_t end = self.size();
+  for (size_t index = pos; index < end; ++index) {
+    if (wanted[static_cast<unsigned char>(chars[index])]) return index;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute force version. An out-of-range |pos| yields npos.
+size_t find_first_of(const StringPiece16& self,
+                     const StringPiece16& s, size_t pos) {
+  // Forming begin() + pos beyond end() would be undefined behavior.
+  if (pos >= self.size()) return StringPiece16::npos;
+  const StringPiece16::const_iterator found =
+      std::find_first_of(self.begin() + pos, self.end(), s.begin(), s.end());
+  return found == self.end() ? StringPiece16::npos
+                             : static_cast<size_t>(found - self.begin());
+}
+
+// 8-bit version using lookup table; npos when no such character exists.
+size_t find_first_not_of(const StringPiece& self,
+                         const StringPiece& s,
+                         size_t pos) {
+  // Covers an empty |self| as well: no position at or after |pos| exists.
+  if (pos >= self.size())
+    return StringPiece::npos;
+  // With nothing to exclude, the character at |pos| itself qualifies.
+  // Returning 0 here would ignore |pos| and disagree with the 16-bit overload.
+  if (s.empty())
+    return pos;
+
+  // Avoid the cost of BuildLookupTable() for a single-character search.
+  if (s.size() == 1)
+    return find_first_not_of(self, s.data()[0], pos);
+
+  bool lookup[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, lookup);
+  for (size_t i = pos; i < self.size(); ++i) {
+    if (!lookup[static_cast<unsigned char>(self.data()[i])]) return i;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version: each candidate character of |self| is looked
+// up in |s| with a linear search.
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+                                     const StringPiece16& s,
+                                     size_t pos) {
+  if (self.empty())
+    return StringPiece16::npos;
+
+  const size_t length = self.size();
+  for (size_t index = pos; index < length; ++index) {
+    const char16 candidate = self[index];
+    // The first character that is missing from |s| is the answer.
+    if (std::find(s.begin(), s.end(), candidate) == s.end())
+      return index;
+  }
+
+  // Every character from |pos| onward occurs in |s|, or |pos| was already
+  // past the end of |self|.
+  return StringPiece16::npos;
+}
+
+// Index of the first character of |self| at or after |pos| that differs
+// from |c|, or npos when every remaining character equals |c|.
+template <typename STR>
+size_t find_first_not_ofT(const BasicStringPiece<STR>& self,
+                          typename STR::value_type c, size_t pos) {
+  if (self.empty())
+    return BasicStringPiece<STR>::npos;
+
+  const typename STR::value_type* const chars = self.data();
+  size_t index = pos;
+  while (index < self.size() && chars[index] == c)
+    ++index;
+  return index < self.size() ? index : BasicStringPiece<STR>::npos;
+}
+
+size_t find_first_not_of(const StringPiece& self,
+                         char c,
+                         size_t pos) {
+  return find_first_not_ofT<std::string>(self, c, pos);  // 8-bit form.
+}
+
+size_t find_first_not_of(const StringPiece16& self,
+                         char16 c,
+                         size_t pos) {
+  return find_first_not_ofT<string16>(self, c, pos);  // 16-bit form.
+}
+
+// 8-bit version: membership in |s| is tested through a lookup table.
+size_t find_last_of(const StringPiece& self, const StringPiece& s, size_t pos) {
+  if (self.empty() || s.empty())
+    return StringPiece::npos;
+
+  // A single wanted character does not justify building the table.
+  if (s.size() == 1)
+    return rfind(self, s.data()[0], pos);
+  bool wanted[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, wanted);
+  const char* const chars = self.data();
+  // |remaining| is the index under test plus one, so the countdown ends at 0.
+  for (size_t remaining = std::min(pos, self.size() - 1) + 1; remaining > 0;
+       --remaining) {
+    if (wanted[static_cast<unsigned char>(chars[remaining - 1])])
+      return remaining - 1;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version: scans |self| backwards from |pos| and checks
+// every candidate character against each character of |s|.
+size_t find_last_of(const StringPiece16& self,
+                    const StringPiece16& s,
+                    size_t pos) {
+  if (self.empty())
+    return StringPiece16::npos;
+
+  const char16* const chars = self.data();
+  // |remaining| is one more than the index examined next, so the countdown
+  // ends at zero without wrapping an unsigned index.
+  for (size_t remaining = std::min(pos, self.size() - 1) + 1; remaining > 0;
+       --remaining) {
+    if (std::find(s.begin(), s.end(), chars[remaining - 1]) != s.end())
+      return remaining - 1;
+  }
+  return StringPiece16::npos;
+}
+
+// 8-bit version: membership in |s| is tested through a lookup table.
+size_t find_last_not_of(const StringPiece& self,
+                        const StringPiece& s,
+                        size_t pos) {
+  if (self.empty())
+    return StringPiece::npos;
+  // The scan starts at |pos|, clamped to the last valid index.
+  const size_t start = std::min(pos, self.size() - 1);
+  if (s.empty())  // Nothing is excluded, so the starting character counts.
+    return start;
+
+  // A single excluded character does not justify building the table.
+  if (s.size() == 1)
+    return find_last_not_of(self, s.data()[0], pos);
+
+  bool excluded[UCHAR_MAX + 1] = { false };
+  BuildLookupTable(s, excluded);
+  const char* const chars = self.data();
+  for (size_t remaining = start + 1; remaining > 0; --remaining) {
+    // |remaining - 1| is the index under test; zero ends the countdown.
+    if (!excluded[static_cast<unsigned char>(chars[remaining - 1])])
+      return remaining - 1;
+  }
+  return StringPiece::npos;
+}
+
+// 16-bit brute-force version: scans |self| backwards from |pos| for the last
+// character that does not occur in |s|; returns npos if there is none.
+size_t find_last_not_of(const StringPiece16& self,
+                        const StringPiece16& s,
+                        size_t pos) {
+  // npos is spelled through StringPiece16, the piece type this overload
+  // serves, consistently on both exits (StringPiece::npos has the same value
+  // but names the 8-bit type).
+  if (self.empty())
+    return StringPiece16::npos;
+
+  // The scan starts at |pos|, clamped to the last valid index.
+  for (size_t self_i = std::min(pos, self.size() - 1); ; --self_i) {
+    const char16 candidate = self.data()[self_i];
+    if (std::find(s.begin(), s.end(), candidate) == s.end())
+      return self_i;
+    if (self_i == 0)  // Stop before the unsigned index would wrap.
+      break;
+  }
+
+  return StringPiece16::npos;
+}
+
+// Index of the last character of |self| at or before |pos| that differs
+// from |c|, or npos when every examined character equals |c|.
+template <typename STR>
+size_t find_last_not_ofT(const BasicStringPiece<STR>& self,
+                         typename STR::value_type c, size_t pos) {
+  if (self.empty())
+    return BasicStringPiece<STR>::npos;
+
+  // Count down with |remaining| = index + 1 so the loop can end at zero.
+  size_t remaining = std::min(pos, self.size() - 1) + 1;
+  while (remaining > 0 && self.data()[remaining - 1] == c)
+    --remaining;
+  if (remaining == 0) return BasicStringPiece<STR>::npos;
+  return remaining - 1;
+}
+
+size_t find_last_not_of(const StringPiece& self,
+                        char c,
+                        size_t pos) {
+  return find_last_not_ofT<std::string>(self, c, pos);  // 8-bit form.
+}
+
+size_t find_last_not_of(const StringPiece16& self,
+                        char16 c,
+                        size_t pos) {
+  return find_last_not_ofT<string16>(self, c, pos);  // 16-bit form.
+}
+
+// View of at most |n| characters from |pos|; both are clamped to the end.
+template <typename STR>
+BasicStringPiece<STR> substrT(const BasicStringPiece<STR>& self,
+                              size_t pos, size_t n) {
+  const size_t start = std::min(pos, self.size());
+  const size_t count = std::min(n, self.size() - start);
+  return BasicStringPiece<STR>(self.data() + start, count);
+}
+
+StringPiece substr(const StringPiece& self,
+                   size_t pos,
+                   size_t n) {
+  return substrT<std::string>(self, pos, n);  // 8-bit instantiation.
+}
+
+StringPiece16 substr(const StringPiece16& self,
+                     size_t pos,
+                     size_t n) {
+  return substrT<string16>(self, pos, n);  // 16-bit instantiation.
+}
+
+#if GURL_DCHECK_IS_ON()
+void AssertIteratorsInOrder(std::string::const_iterator begin,
+ std::string::const_iterator end) {  // std::string iterator overload.
+ GURL_DCHECK(begin <= end) << "StringPiece iterators swapped or invalid.";
+}
+void AssertIteratorsInOrder(string16::const_iterator begin,
+ string16::const_iterator end) {  // string16 iterator overload.
+ GURL_DCHECK(begin <= end) << "StringPiece iterators swapped or invalid.";
+}
+#endif
+
+} // namespace internal
+} // namespace gurl_base
diff --git a/base/strings/string_piece.h b/base/strings/string_piece.h
new file mode 100644
index 0000000..5359af6
--- /dev/null
+++ b/base/strings/string_piece.h
@@ -0,0 +1,548 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// Copied from strings/stringpiece.h with modifications
+//
+// A string-like object that points to a sized piece of memory.
+//
+// You can use StringPiece as a function or method parameter. A StringPiece
+// parameter can receive a double-quoted string literal argument, a "const
+// char*" argument, a string argument, or a StringPiece argument with no data
+// copying. Systematic use of StringPiece for arguments reduces data
+// copies and strlen() calls.
+//
+// Prefer passing StringPieces by value:
+// void MyFunction(StringPiece arg);
+// If circumstances require, you may also pass by const reference:
+// void MyFunction(const StringPiece& arg); // not preferred
+// Both of these have the same lifetime semantics. Passing by value
+// generates slightly smaller code. For more discussion, Googlers can see
+// the thread go/stringpiecebyvalue on c-users.
+
+#ifndef BASE_STRINGS_STRING_PIECE_H_
+#define BASE_STRINGS_STRING_PIECE_H_
+
+#include <stddef.h>
+
+#include <iosfwd>
+#include <string>
+#include <type_traits>
+
+#include "polyfills/base/base_export.h"
+#include "polyfills/base/logging.h"
+#include "base/strings/char_traits.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece_forward.h"
+
+namespace gurl_base {
+
+// internal --------------------------------------------------------------------
+
+// Many of the StringPiece functions use different implementations for the
+// 8-bit and 16-bit versions, and we don't want lots of template expansions in
+// this (very common) header that will slow down compilation.
+//
+// So here we define overloaded functions called by the StringPiece template.
+// For those that share an implementation, the two versions will expand to a
+// template internal to the .cc file.
+namespace internal {
+
+BASE_EXPORT void CopyToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void CopyToString(const StringPiece16& self, string16* target);
+
+BASE_EXPORT void AppendToString(const StringPiece& self, std::string* target);
+BASE_EXPORT void AppendToString(const StringPiece16& self, string16* target);
+
+BASE_EXPORT size_t copy(const StringPiece& self,
+ char* buf,
+ size_t n,
+ size_t pos);
+BASE_EXPORT size_t copy(const StringPiece16& self,
+ char16* buf,
+ size_t n,
+ size_t pos);
+
+BASE_EXPORT size_t find(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t find(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t rfind(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t rfind(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t find_first_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece& self,
+ char c,
+ size_t pos);
+BASE_EXPORT size_t find_first_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_of(const StringPiece& self,
+ char c,
+ size_t pos);  // NOTE(review): string_piece.cc defines no such overload.
+BASE_EXPORT size_t find_last_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);  // NOTE(review): ditto; the class calls rfind(c, pos) instead.
+
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+ const StringPiece& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+ const StringPiece16& s,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece16& self,
+ char16 c,
+ size_t pos);
+BASE_EXPORT size_t find_last_not_of(const StringPiece& self,
+ char c,
+ size_t pos);
+
+BASE_EXPORT StringPiece substr(const StringPiece& self,
+ size_t pos,
+ size_t n);
+BASE_EXPORT StringPiece16 substr(const StringPiece16& self,
+ size_t pos,
+ size_t n);
+
+#if GURL_DCHECK_IS_ON()
+// Asserts that begin <= end to catch some errors with iterator usage.
+BASE_EXPORT void AssertIteratorsInOrder(std::string::const_iterator begin,
+ std::string::const_iterator end);
+BASE_EXPORT void AssertIteratorsInOrder(string16::const_iterator begin,
+ string16::const_iterator end);
+#endif
+
+} // namespace internal
+
+// BasicStringPiece ------------------------------------------------------------
+
+// Defines the types, methods, operators, and data members common to both
+// StringPiece and StringPiece16. Do not refer to this class directly, but
+// rather to BasicStringPiece, StringPiece, or StringPiece16.
+//
+// This is templatized by string class type rather than character type, so
+// BasicStringPiece<std::string> or BasicStringPiece<gurl_base::string16>.
+template <typename STRING_TYPE> class BasicStringPiece {
+ public:
+ // Standard STL container boilerplate.
+ typedef size_t size_type;
+ typedef typename STRING_TYPE::value_type value_type;
+ typedef const value_type* pointer;
+ typedef const value_type& reference;
+ typedef const value_type& const_reference;
+ typedef ptrdiff_t difference_type;
+ typedef const value_type* const_iterator;
+ typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+ static const size_type npos;  // Defined after the class as size_type(-1).
+
+ public:
+ // We provide non-explicit singleton constructors so users can pass
+ // in a "const char*" or a "string" wherever a "StringPiece" is
+ // expected (likewise for char16, string16, StringPiece16).
+ constexpr BasicStringPiece() : ptr_(NULL), length_(0) {}
+ // TODO(dcheng): Construction from nullptr is not allowed for
+ // std::basic_string_view, so remove the special handling for it.
+ // Note: This doesn't just use STRING_TYPE::traits_type::length(), since that
+ // isn't constexpr until C++17.
+ constexpr BasicStringPiece(const value_type* str)
+ : ptr_(str), length_(!str ? 0 : CharTraits<value_type>::length(str)) {}
+ BasicStringPiece(const STRING_TYPE& str)  // Views |str|'s buffer; no copy.
+ : ptr_(str.data()), length_(str.size()) {}
+ constexpr BasicStringPiece(const value_type* offset, size_type len)
+ : ptr_(offset), length_(len) {}
+ BasicStringPiece(const typename STRING_TYPE::const_iterator& begin,
+ const typename STRING_TYPE::const_iterator& end) {
+#if GURL_DCHECK_IS_ON()
+ // This assertion is done out-of-line to avoid bringing in logging.h and
+ // instantiating logging macros for every instantiation.
+ internal::AssertIteratorsInOrder(begin, end);
+#endif
+ length_ = static_cast<size_t>(std::distance(begin, end));
+
+ // The length test before assignment is to avoid dereferencing an iterator
+ // that may point to the end() of a string.
+ ptr_ = length_ > 0 ? &*begin : nullptr;
+ }
+
+ // data() may return a pointer to a buffer with embedded NULs, and the
+ // returned buffer may or may not be null terminated. Therefore it is
+ // typically a mistake to pass data() to a routine that expects a NUL
+ // terminated string.
+ constexpr const value_type* data() const { return ptr_; }
+ constexpr size_type size() const noexcept { return length_; }
+ constexpr size_type length() const noexcept { return length_; }
+ bool empty() const { return length_ == 0; }
+
+ void clear() {  // Resets to the empty piece: null pointer, zero length.
+ ptr_ = NULL;
+ length_ = 0;
+ }
+ void set(const value_type* data, size_type len) {
+ ptr_ = data;
+ length_ = len;
+ }
+ void set(const value_type* str) {  // A null |str| gives a zero length.
+ ptr_ = str;
+ length_ = str ? STRING_TYPE::traits_type::length(str) : 0;
+ }
+
+ constexpr value_type operator[](size_type i) const {
+ GURL_CHECK(i < length_);  // Out-of-range indexes fail the check.
+ return ptr_[i];
+ }
+
+ value_type front() const {
+ GURL_CHECK_NE(0UL, length_);  // Requires a non-empty piece.
+ return ptr_[0];
+ }
+
+ value_type back() const {
+ GURL_CHECK_NE(0UL, length_);  // Requires a non-empty piece.
+ return ptr_[length_ - 1];
+ }
+
+ constexpr void remove_prefix(size_type n) {  // Drops the first |n| chars.
+ GURL_CHECK(n <= length_);
+ ptr_ += n;
+ length_ -= n;
+ }
+
+ constexpr void remove_suffix(size_type n) {  // Drops the last |n| chars.
+ GURL_CHECK(n <= length_);
+ length_ -= n;
+ }
+
+ constexpr int compare(BasicStringPiece x) const noexcept {
+ int r = CharTraits<value_type>::compare(
+ ptr_, x.ptr_, (length_ < x.length_ ? length_ : x.length_));
+ if (r == 0) {  // Equal common prefix: the shorter piece orders first.
+ if (length_ < x.length_) r = -1;
+ else if (length_ > x.length_) r = +1;
+ }
+ return r;
+ }
+
+ // This is the style of conversion preferred by std::string_view in C++17.
+ explicit operator STRING_TYPE() const { return as_string(); }
+
+ STRING_TYPE as_string() const {
+ // std::string doesn't like to take a NULL pointer even with a 0 size.
+ return empty() ? STRING_TYPE() : STRING_TYPE(data(), size());
+ }
+
+ const_iterator begin() const { return ptr_; }
+ const_iterator end() const { return ptr_ + length_; }
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(ptr_ + length_);
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(ptr_);
+ }
+
+ size_type max_size() const { return length_; }  // Same value as size().
+ size_type capacity() const { return length_; }  // Same value as size().
+
+ // Sets the value of the given string target type to be the current string.
+ // This saves a temporary over doing |a = b.as_string()|
+ void CopyToString(STRING_TYPE* target) const {
+ internal::CopyToString(*this, target);
+ }
+
+ void AppendToString(STRING_TYPE* target) const {
+ internal::AppendToString(*this, target);
+ }
+
+ size_type copy(value_type* buf, size_type n, size_type pos = 0) const {
+ return internal::copy(*this, buf, n, pos);
+ }
+
+ // Does "this" start with "x"
+ constexpr bool starts_with(BasicStringPiece x) const noexcept {
+ return (
+ (this->length_ >= x.length_) &&
+ (CharTraits<value_type>::compare(this->ptr_, x.ptr_, x.length_) == 0));
+ }
+
+ // Does "this" end with "x"
+ constexpr bool ends_with(BasicStringPiece x) const noexcept {
+ return ((this->length_ >= x.length_) &&
+ (CharTraits<value_type>::compare(
+ this->ptr_ + (this->length_ - x.length_), x.ptr_, x.length_) ==
+ 0));
+ }
+
+ // find: Search for a character or substring at a given offset.
+ size_type find(const BasicStringPiece<STRING_TYPE>& s,
+ size_type pos = 0) const {
+ return internal::find(*this, s, pos);
+ }
+ size_type find(value_type c, size_type pos = 0) const {
+ return internal::find(*this, c, pos);
+ }
+
+ // rfind: Reverse find.
+ size_type rfind(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::rfind(*this, s, pos);
+ }
+ size_type rfind(value_type c, size_type pos = BasicStringPiece::npos) const {
+ return internal::rfind(*this, c, pos);
+ }
+
+ // find_first_of: Find the first occurrence of one of a set of characters.
+ size_type find_first_of(const BasicStringPiece& s,
+ size_type pos = 0) const {
+ return internal::find_first_of(*this, s, pos);
+ }
+ size_type find_first_of(value_type c, size_type pos = 0) const {
+ return find(c, pos);  // One character: same as find().
+ }
+
+ // find_first_not_of: Find the first occurrence not of a set of characters.
+ size_type find_first_not_of(const BasicStringPiece& s,
+ size_type pos = 0) const {
+ return internal::find_first_not_of(*this, s, pos);
+ }
+ size_type find_first_not_of(value_type c, size_type pos = 0) const {
+ return internal::find_first_not_of(*this, c, pos);
+ }
+
+ // find_last_of: Find the last occurrence of one of a set of characters.
+ size_type find_last_of(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_of(*this, s, pos);
+ }
+ size_type find_last_of(value_type c,
+ size_type pos = BasicStringPiece::npos) const {
+ return rfind(c, pos);  // One character: same as rfind().
+ }
+
+ // find_last_not_of: Find the last occurrence not of a set of characters.
+ size_type find_last_not_of(const BasicStringPiece& s,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_not_of(*this, s, pos);
+ }
+ size_type find_last_not_of(value_type c,
+ size_type pos = BasicStringPiece::npos) const {
+ return internal::find_last_not_of(*this, c, pos);
+ }
+
+ // substr.
+ BasicStringPiece substr(size_type pos,
+ size_type n = BasicStringPiece::npos) const {
+ return internal::substr(*this, pos, n);
+ }
+
+ protected:
+ const value_type* ptr_;  // Start of the viewed characters; may be null.
+ size_type length_;  // Number of characters viewed.
+};
+
+template <typename STRING_TYPE>  // Largest size_type value: "not found".
+const typename BasicStringPiece<STRING_TYPE>::size_type
+    BasicStringPiece<STRING_TYPE>::npos =
+        static_cast<typename BasicStringPiece<STRING_TYPE>::size_type>(-1);
+
+// MSVC doesn't like complex extern templates and DLLs.
+#if !defined(COMPILER_MSVC)
+extern template class BASE_EXPORT BasicStringPiece<std::string>;
+extern template class BASE_EXPORT BasicStringPiece<string16>;
+#endif
+
+// Comparison operators --------------------------------------------------------
+// operator ==
+template <typename StringT>
+constexpr bool operator==(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs.size() != rhs.size() || lhs.compare(rhs) != 0);
+}
+
+// The overloads here and below wrap one parameter in std::common_type_t, which
+// emulates an identity type transformation and creates a non-deduced context.
+// That lets a StringPiece be compared with any type that implicitly converts
+// to a StringPiece; see https://wg21.link/n3766 for details.
+// The dummy int template parameters on these overloads work around a name
+// mangling issue on Windows.
+template <typename StringT, int = 1>
+constexpr bool operator==(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return !(lhs.size() != rhs.size() || lhs.compare(rhs) != 0);
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator==(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs.size() != rhs.size() || lhs.compare(rhs) != 0);
+}
+
+// operator !=: true when the sizes or the contents differ.
+template <typename StringT>
+constexpr bool operator!=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.size() != rhs.size() || lhs.compare(rhs) != 0;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator!=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return lhs.size() != rhs.size() || lhs.compare(rhs) != 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator!=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.size() != rhs.size() || lhs.compare(rhs) != 0;
+}
+
+// operator <: true when compare() orders |lhs| before |rhs|.
+template <typename StringT>
+constexpr bool operator<(BasicStringPiece<StringT> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs.compare(rhs) >= 0);
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator<(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return !(lhs.compare(rhs) >= 0);
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator<(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return !(lhs.compare(rhs) >= 0);
+}
+
+// operator >: true when compare() orders |rhs| before |lhs|.
+template <typename StringT>
+constexpr bool operator>(BasicStringPiece<StringT> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return rhs.compare(lhs) < 0;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator>(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return rhs.compare(lhs) < 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator>(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                         BasicStringPiece<StringT> rhs) noexcept {
+  return rhs.compare(lhs) < 0;
+}
+
+// operator <=: true unless compare() orders |rhs| before |lhs|.
+template <typename StringT>
+constexpr bool operator<=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return rhs.compare(lhs) >= 0;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator<=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return rhs.compare(lhs) >= 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator<=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return rhs.compare(lhs) >= 0;
+}
+
+// operator >=: true unless compare() orders |lhs| before |rhs|.
+template <typename StringT>
+constexpr bool operator>=(BasicStringPiece<StringT> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.compare(rhs) >= 0;
+}
+
+template <typename StringT, int = 1>
+constexpr bool operator>=(
+    BasicStringPiece<StringT> lhs,
+    std::common_type_t<BasicStringPiece<StringT>> rhs) noexcept {
+  return lhs.compare(rhs) >= 0;
+}
+
+template <typename StringT, int = 2>
+constexpr bool operator>=(std::common_type_t<BasicStringPiece<StringT>> lhs,
+                          BasicStringPiece<StringT> rhs) noexcept {
+  return lhs.compare(rhs) >= 0;
+}
+
+BASE_EXPORT std::ostream& operator<<(std::ostream& o,
+ const StringPiece& piece);
+
+BASE_EXPORT std::ostream& operator<<(std::ostream& o,
+ const StringPiece16& piece);
+
+// Hashing ---------------------------------------------------------------------
+
+// We provide appropriate hash functions so StringPiece and StringPiece16 can
+// be used as keys in hash sets and maps.
+
+// This hash function is copied from base/strings/string16.h. We don't use the
+// ones already defined for string and string16 directly because it would
+// require the string constructors to be called, which we don't want.
+
+template <typename StringPieceType>
+struct StringPieceHashImpl {
+  // Polynomial hash: multiply by 131, then add the next character.
+  std::size_t operator()(StringPieceType sp) const {
+    std::size_t hash = 0;
+    for (auto it = sp.begin(); it != sp.end(); ++it) hash = hash * 131 + *it;
+    return hash;
+  }
+};
+
+using StringPieceHash = StringPieceHashImpl<StringPiece>;
+using StringPiece16Hash = StringPieceHashImpl<StringPiece16>;
+using WStringPieceHash = StringPieceHashImpl<WStringPiece>;
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_PIECE_H_
diff --git a/base/strings/string_piece_forward.h b/base/strings/string_piece_forward.h
new file mode 100644
index 0000000..aa79117
--- /dev/null
+++ b/base/strings/string_piece_forward.h
@@ -0,0 +1,24 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Forward declaration of StringPiece types from base/strings/string_piece.h
+
+#ifndef BASE_STRINGS_STRING_PIECE_FORWARD_H_
+#define BASE_STRINGS_STRING_PIECE_FORWARD_H_
+
+#include <string>
+
+#include "base/strings/string16.h"
+
+namespace gurl_base {
+
+template <typename STRING_TYPE>
+class BasicStringPiece;
+typedef BasicStringPiece<std::string> StringPiece;
+typedef BasicStringPiece<string16> StringPiece16;
+typedef BasicStringPiece<std::wstring> WStringPiece;
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_PIECE_FORWARD_H_
diff --git a/base/strings/string_piece_unittest.cc b/base/strings/string_piece_unittest.cc
new file mode 100644
index 0000000..8e245e6
--- /dev/null
+++ b/base/strings/string_piece_unittest.cc
@@ -0,0 +1,838 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <string>
+
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+template <typename StringT>
+class CommonStringPieceTest : public ::testing::Test {
+ public:
+  // Builds the string type under test from a C string; the primary template
+  // relies on StringT being constructible from const char*.
+  static const StringT as_string(const char* input) { return StringT(input); }
+
+  // Input that already has the target type is handed back as the same object.
+  static const StringT& as_string(const StringT& input) { return input; }
+};
+
+template <>
+class CommonStringPieceTest<string16> : public ::testing::Test {  // string16 fixture.
+ public:
+ static const string16 as_string(const char* input) {
+ return ASCIIToUTF16(input);  // C-string input goes through ASCIIToUTF16().
+ }
+ static const string16 as_string(const std::string& input) {
+ return ASCIIToUTF16(input);  // Same conversion for std::string input.
+ }
+};
+
+using SupportedStringTypes = ::testing::Types<std::string, string16>;
+// Instantiates each CommonStringPieceTest TYPED_TEST for the types listed above.
+TYPED_TEST_SUITE(CommonStringPieceTest, SupportedStringTypes);
+
+TYPED_TEST(CommonStringPieceTest, CheckComparisonOperators) {
+#define CMP_Y(op, x, y) \
+ { \
+ TypeParam lhs(TestFixture::as_string(x)); \
+ TypeParam rhs(TestFixture::as_string(y)); \
+ ASSERT_TRUE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+ op BasicStringPiece<TypeParam>((rhs.c_str())))); \
+ ASSERT_TRUE(BasicStringPiece<TypeParam>(lhs) op rhs); \
+ ASSERT_TRUE(lhs op BasicStringPiece<TypeParam>(rhs)); \
+ ASSERT_TRUE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+ .compare(BasicStringPiece<TypeParam>((rhs.c_str()))) \
+ op 0)); \
+ }
+// CMP_N: the same four comparison forms as CMP_Y, each expected to be false.
+#define CMP_N(op, x, y) \
+ { \
+ TypeParam lhs(TestFixture::as_string(x)); \
+ TypeParam rhs(TestFixture::as_string(y)); \
+ ASSERT_FALSE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+ op BasicStringPiece<TypeParam>((rhs.c_str())))); \
+ ASSERT_FALSE(BasicStringPiece<TypeParam>(lhs) op rhs); \
+ ASSERT_FALSE(lhs op BasicStringPiece<TypeParam>(rhs)); \
+ ASSERT_FALSE((BasicStringPiece<TypeParam>((lhs.c_str())) \
+ .compare(BasicStringPiece<TypeParam>((rhs.c_str()))) \
+ op 0)); \
+ }
+
+ CMP_Y(==, "", "")
+ CMP_Y(==, "a", "a")
+ CMP_Y(==, "aa", "aa")
+ CMP_N(==, "a", "")
+ CMP_N(==, "", "a")
+ CMP_N(==, "a", "b")
+ CMP_N(==, "a", "aa")
+ CMP_N(==, "aa", "a")
+
+ CMP_N(!=, "", "")
+ CMP_N(!=, "a", "a")
+ CMP_N(!=, "aa", "aa")
+ CMP_Y(!=, "a", "")
+ CMP_Y(!=, "", "a")
+ CMP_Y(!=, "a", "b")
+ CMP_Y(!=, "a", "aa")
+ CMP_Y(!=, "aa", "a")
+
+ CMP_Y(<, "a", "b")
+ CMP_Y(<, "a", "aa")
+ CMP_Y(<, "aa", "b")
+ CMP_Y(<, "aa", "bb")
+ CMP_N(<, "a", "a")
+ CMP_N(<, "b", "a")
+ CMP_N(<, "aa", "a")
+ CMP_N(<, "b", "aa")
+ CMP_N(<, "bb", "aa")
+
+ CMP_Y(<=, "a", "a")
+ CMP_Y(<=, "a", "b")
+ CMP_Y(<=, "a", "aa")
+ CMP_Y(<=, "aa", "b")
+ CMP_Y(<=, "aa", "bb")
+ CMP_N(<=, "b", "a")
+ CMP_N(<=, "aa", "a")
+ CMP_N(<=, "b", "aa")
+ CMP_N(<=, "bb", "aa")
+
+ CMP_N(>=, "a", "b")
+ CMP_N(>=, "a", "aa")
+ CMP_N(>=, "aa", "b")
+ CMP_N(>=, "aa", "bb")
+ CMP_Y(>=, "a", "a")
+ CMP_Y(>=, "b", "a")
+ CMP_Y(>=, "aa", "a")
+ CMP_Y(>=, "b", "aa")
+ CMP_Y(>=, "bb", "aa")
+
+ CMP_N(>, "a", "a")
+ CMP_N(>, "a", "b")
+ CMP_N(>, "a", "aa")
+ CMP_N(>, "aa", "b")
+ CMP_N(>, "aa", "bb")
+ CMP_Y(>, "b", "a")
+ CMP_Y(>, "aa", "a")
+ CMP_Y(>, "b", "aa")
+ CMP_Y(>, "bb", "aa")
+ // Lengths 1..256: a copy is equal; altering any of the first i chars is not.
+ std::string x;
+ for (int i = 0; i < 256; i++) {
+ x += 'a';
+ std::string y = x;
+ CMP_Y(==, x, y);
+ for (int j = 0; j < i; j++) {
+ std::string z = x;
+ z[j] = 'b'; // z now differs from x at position j only.
+ CMP_N(==, x, z);
+ }
+ }
+
+#undef CMP_Y
+#undef CMP_N
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckSTL) {
+  using Char = typename TypeParam::value_type;
+  TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+  TypeParam abc(TestFixture::as_string("abc"));
+  TypeParam xyz(TestFixture::as_string("xyz"));
+  TypeParam foobar(TestFixture::as_string("foobar"));
+
+  BasicStringPiece<TypeParam> a(alphabet);
+  BasicStringPiece<TypeParam> b(abc);
+  BasicStringPiece<TypeParam> c(xyz);
+  BasicStringPiece<TypeParam> d(foobar);
+  BasicStringPiece<TypeParam> e;
+  TypeParam with_nul(TestFixture::as_string("123"));
+  with_nul += static_cast<Char>(0);
+  with_nul += TestFixture::as_string("456");
+  BasicStringPiece<TypeParam> f(with_nul);
+  // operator[]: |f| holds "123", an embedded NUL, then "456".
+  ASSERT_EQ(a[6], static_cast<Char>('g'));
+  ASSERT_EQ(b[0], static_cast<Char>('a'));
+  ASSERT_EQ(c[2], static_cast<Char>('z'));
+  ASSERT_EQ(f[3], static_cast<Char>('\0'));
+  ASSERT_EQ(f[5], static_cast<Char>('5'));
+  // data(): null for the default-constructed piece.
+  ASSERT_EQ(*d.data(), static_cast<Char>('f'));
+  ASSERT_EQ(d.data()[5], static_cast<Char>('r'));
+  ASSERT_EQ(e.data(), nullptr);
+  // Forward iterators.
+  ASSERT_EQ(*a.begin(), static_cast<Char>('a'));
+  ASSERT_EQ(*(b.begin() + 2), static_cast<Char>('c'));
+  ASSERT_EQ(*(c.end() - 1), static_cast<Char>('z'));
+  // Reverse iterators.
+  ASSERT_EQ(*a.rbegin(), static_cast<Char>('z'));
+  ASSERT_EQ(*(b.rbegin() + 2), static_cast<Char>('a'));
+  ASSERT_EQ(*(c.rend() - 1), static_cast<Char>('x'));
+  ASSERT_EQ(a.rbegin() + 26, a.rend());
+  // size(): the embedded NUL in |f| counts as a character.
+  ASSERT_EQ(a.size(), 26U);
+  ASSERT_EQ(b.size(), 3U);
+  ASSERT_EQ(c.size(), 3U);
+  ASSERT_EQ(d.size(), 6U);
+  ASSERT_EQ(e.size(), 0U);
+  ASSERT_EQ(f.size(), 7U);
+  // A non-empty piece spans a non-empty iterator range.
+  ASSERT_FALSE(d.empty());
+  ASSERT_TRUE(d.begin() != d.end());
+  ASSERT_EQ(d.begin() + 6, d.end());
+  // The default-constructed piece is empty.
+  ASSERT_TRUE(e.empty());
+  ASSERT_EQ(e.begin(), e.end());
+  // clear() leaves the piece empty with null data.
+  d.clear();
+  ASSERT_EQ(d.size(), 0U);
+  ASSERT_TRUE(d.empty());
+  ASSERT_EQ(d.data(), nullptr);
+  ASSERT_EQ(d.begin(), d.end());
+  // max_size() >= capacity() >= size().
+  ASSERT_GE(a.max_size(), a.capacity());
+  ASSERT_GE(a.capacity(), a.size());
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckFind) {
+ typedef BasicStringPiece<TypeParam> Piece;
+
+ TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+ TypeParam abc(TestFixture::as_string("abc"));
+ TypeParam xyz(TestFixture::as_string("xyz"));
+ TypeParam foobar(TestFixture::as_string("foobar"));
+
+ BasicStringPiece<TypeParam> a(alphabet);
+ BasicStringPiece<TypeParam> b(abc);
+ BasicStringPiece<TypeParam> c(xyz);
+ BasicStringPiece<TypeParam> d(foobar);
+
+ d.clear();  // From here on d is empty, like the default-constructed e below.
+ Piece e;
+ TypeParam temp(TestFixture::as_string("123"));
+ temp.push_back('\0');
+ temp += TestFixture::as_string("456");
+ Piece f(temp);
+
+ typename TypeParam::value_type buf[4] = { '%', '%', '%', '%' };
+ ASSERT_EQ(a.copy(buf, 4), 4U);
+ ASSERT_EQ(buf[0], a[0]);
+ ASSERT_EQ(buf[1], a[1]);
+ ASSERT_EQ(buf[2], a[2]);
+ ASSERT_EQ(buf[3], a[3]);
+ ASSERT_EQ(a.copy(buf, 3, 7), 3U);
+ ASSERT_EQ(buf[0], a[7]);
+ ASSERT_EQ(buf[1], a[8]);
+ ASSERT_EQ(buf[2], a[9]);
+ ASSERT_EQ(buf[3], a[3]);  // Left untouched by the 3-character copy.
+ ASSERT_EQ(c.copy(buf, 99), 3U);
+ ASSERT_EQ(buf[0], c[0]);
+ ASSERT_EQ(buf[1], c[1]);
+ ASSERT_EQ(buf[2], c[2]);
+ ASSERT_EQ(buf[3], a[3]);  // Only the 3 characters of c were copied.
+
+ ASSERT_EQ(Piece::npos, TypeParam::npos);
+
+ ASSERT_EQ(a.find(b), 0U);
+ ASSERT_EQ(a.find(b, 1), Piece::npos);
+ ASSERT_EQ(a.find(c), 23U);
+ ASSERT_EQ(a.find(c, 9), 23U);
+ ASSERT_EQ(a.find(c, Piece::npos), Piece::npos);
+ ASSERT_EQ(b.find(c), Piece::npos);
+ ASSERT_EQ(b.find(c, Piece::npos), Piece::npos);
+ ASSERT_EQ(a.find(d), 0U);
+ ASSERT_EQ(a.find(e), 0U);
+ ASSERT_EQ(a.find(d, 12), 12U);
+ ASSERT_EQ(a.find(e, 17), 17U);
+ TypeParam not_found(TestFixture::as_string("xx not found bb"));
+ Piece g(not_found);
+ ASSERT_EQ(a.find(g), Piece::npos);
+ // Empty haystacks (d, e): a non-empty needle is never found.
+ ASSERT_EQ(d.find(b), Piece::npos);
+ ASSERT_EQ(e.find(b), Piece::npos);
+ ASSERT_EQ(d.find(b, 4), Piece::npos);
+ ASSERT_EQ(e.find(b, 7), Piece::npos);
+
+ size_t empty_search_pos = TypeParam().find(TypeParam());
+ ASSERT_EQ(d.find(d), empty_search_pos);
+ ASSERT_EQ(d.find(e), empty_search_pos);
+ ASSERT_EQ(e.find(d), empty_search_pos);
+ ASSERT_EQ(e.find(e), empty_search_pos);
+ ASSERT_EQ(d.find(d, 4), std::string().find(std::string(), 4));
+ ASSERT_EQ(d.find(e, 4), std::string().find(std::string(), 4));
+ ASSERT_EQ(e.find(d, 4), std::string().find(std::string(), 4));
+ ASSERT_EQ(e.find(e, 4), std::string().find(std::string(), 4));
+
+ ASSERT_EQ(a.find('a'), 0U);
+ ASSERT_EQ(a.find('c'), 2U);
+ ASSERT_EQ(a.find('z'), 25U);
+ ASSERT_EQ(a.find('$'), Piece::npos);
+ ASSERT_EQ(a.find('\0'), Piece::npos);
+ ASSERT_EQ(f.find('\0'), 3U);
+ ASSERT_EQ(f.find('3'), 2U);
+ ASSERT_EQ(f.find('5'), 5U);
+ ASSERT_EQ(g.find('o'), 4U);
+ ASSERT_EQ(g.find('o', 4), 4U);
+ ASSERT_EQ(g.find('o', 5), 8U);
+ ASSERT_EQ(a.find('b', 5), Piece::npos);
+ // Empty haystacks (d, e): no character is found, not even '\0'.
+ ASSERT_EQ(d.find('\0'), Piece::npos);
+ ASSERT_EQ(e.find('\0'), Piece::npos);
+ ASSERT_EQ(d.find('\0', 4), Piece::npos);
+ ASSERT_EQ(e.find('\0', 7), Piece::npos);
+ ASSERT_EQ(d.find('x'), Piece::npos);
+ ASSERT_EQ(e.find('x'), Piece::npos);
+ ASSERT_EQ(d.find('x', 4), Piece::npos);
+ ASSERT_EQ(e.find('x', 7), Piece::npos);
+
+ ASSERT_EQ(a.rfind(b), 0U);
+ ASSERT_EQ(a.rfind(b, 1), 0U);
+ ASSERT_EQ(a.rfind(c), 23U);
+ ASSERT_EQ(a.rfind(c, 22U), Piece::npos);
+ ASSERT_EQ(a.rfind(c, 1U), Piece::npos);
+ ASSERT_EQ(a.rfind(c, 0U), Piece::npos);
+ ASSERT_EQ(b.rfind(c), Piece::npos);
+ ASSERT_EQ(b.rfind(c, 0U), Piece::npos);
+ ASSERT_EQ(a.rfind(d), static_cast<size_t>(a.as_string().rfind(TypeParam())));
+ ASSERT_EQ(a.rfind(e), a.as_string().rfind(TypeParam()));
+ ASSERT_EQ(a.rfind(d), static_cast<size_t>(TypeParam(a).rfind(TypeParam())));
+ ASSERT_EQ(a.rfind(e), TypeParam(a).rfind(TypeParam()));
+ ASSERT_EQ(a.rfind(d, 12), 12U);
+ ASSERT_EQ(a.rfind(e, 17), 17U);
+ ASSERT_EQ(a.rfind(g), Piece::npos);
+ ASSERT_EQ(d.rfind(b), Piece::npos);
+ ASSERT_EQ(e.rfind(b), Piece::npos);
+ ASSERT_EQ(d.rfind(b, 4), Piece::npos);
+ ASSERT_EQ(e.rfind(b, 7), Piece::npos);
+ // Empty needle in an empty haystack: must agree with std::string::rfind().
+ ASSERT_EQ(d.rfind(d, 4), std::string().rfind(std::string()));
+ ASSERT_EQ(e.rfind(d, 7), std::string().rfind(std::string()));
+ ASSERT_EQ(d.rfind(e, 4), std::string().rfind(std::string()));
+ ASSERT_EQ(e.rfind(e, 7), std::string().rfind(std::string()));
+ ASSERT_EQ(d.rfind(d), std::string().rfind(std::string()));
+ ASSERT_EQ(e.rfind(d), std::string().rfind(std::string()));
+ ASSERT_EQ(d.rfind(e), std::string().rfind(std::string()));
+ ASSERT_EQ(e.rfind(e), std::string().rfind(std::string()));
+
+ ASSERT_EQ(g.rfind('o'), 8U);
+ ASSERT_EQ(g.rfind('q'), Piece::npos);
+ ASSERT_EQ(g.rfind('o', 8), 8U);
+ ASSERT_EQ(g.rfind('o', 7), 4U);
+ ASSERT_EQ(g.rfind('o', 3), Piece::npos);
+ ASSERT_EQ(f.rfind('\0'), 3U);
+ ASSERT_EQ(f.rfind('\0', 12), 3U);
+ ASSERT_EQ(f.rfind('3'), 2U);
+ ASSERT_EQ(f.rfind('5'), 5U);
+ // Empty haystacks (d, e): reverse character search finds nothing either.
+ ASSERT_EQ(d.rfind('o'), Piece::npos);
+ ASSERT_EQ(e.rfind('o'), Piece::npos);
+ ASSERT_EQ(d.rfind('o', 4), Piece::npos);
+ ASSERT_EQ(e.rfind('o', 7), Piece::npos);
+
+ TypeParam one_two_three_four(TestFixture::as_string("one,two:three;four"));
+ TypeParam comma_colon(TestFixture::as_string(",:"));
+ ASSERT_EQ(3U, Piece(one_two_three_four).find_first_of(comma_colon));
+ ASSERT_EQ(a.find_first_of(b), 0U);
+ ASSERT_EQ(a.find_first_of(b, 0), 0U);
+ ASSERT_EQ(a.find_first_of(b, 1), 1U);
+ ASSERT_EQ(a.find_first_of(b, 2), 2U);
+ ASSERT_EQ(a.find_first_of(b, 3), Piece::npos);
+ ASSERT_EQ(a.find_first_of(c), 23U);
+ ASSERT_EQ(a.find_first_of(c, 23), 23U);
+ ASSERT_EQ(a.find_first_of(c, 24), 24U);
+ ASSERT_EQ(a.find_first_of(c, 25), 25U);
+ ASSERT_EQ(a.find_first_of(c, 26), Piece::npos);
+ ASSERT_EQ(g.find_first_of(b), 13U);
+ ASSERT_EQ(g.find_first_of(c), 0U);
+ ASSERT_EQ(a.find_first_of(f), Piece::npos);
+ ASSERT_EQ(f.find_first_of(a), Piece::npos);
+ // An empty character set or an empty haystack never produces a match.
+ ASSERT_EQ(a.find_first_of(d), Piece::npos);
+ ASSERT_EQ(a.find_first_of(e), Piece::npos);
+ ASSERT_EQ(d.find_first_of(b), Piece::npos);
+ ASSERT_EQ(e.find_first_of(b), Piece::npos);
+ ASSERT_EQ(d.find_first_of(d), Piece::npos);
+ ASSERT_EQ(e.find_first_of(d), Piece::npos);
+ ASSERT_EQ(d.find_first_of(e), Piece::npos);
+ ASSERT_EQ(e.find_first_of(e), Piece::npos);
+
+ ASSERT_EQ(a.find_first_not_of(b), 3U);
+ ASSERT_EQ(a.find_first_not_of(c), 0U);
+ ASSERT_EQ(b.find_first_not_of(a), Piece::npos);
+ ASSERT_EQ(c.find_first_not_of(a), Piece::npos);
+ ASSERT_EQ(f.find_first_not_of(a), 0U);
+ ASSERT_EQ(a.find_first_not_of(f), 0U);
+ ASSERT_EQ(a.find_first_not_of(d), 0U);
+ ASSERT_EQ(a.find_first_not_of(e), 0U);
+ // Empty haystacks (d, e): there is no character to report, whatever the set.
+ ASSERT_EQ(d.find_first_not_of(a), Piece::npos);
+ ASSERT_EQ(e.find_first_not_of(a), Piece::npos);
+ ASSERT_EQ(d.find_first_not_of(d), Piece::npos);
+ ASSERT_EQ(e.find_first_not_of(d), Piece::npos);
+ ASSERT_EQ(d.find_first_not_of(e), Piece::npos);
+ ASSERT_EQ(e.find_first_not_of(e), Piece::npos);
+
+ TypeParam equals(TestFixture::as_string("===="));
+ Piece h(equals);
+ ASSERT_EQ(h.find_first_not_of('='), Piece::npos);
+ ASSERT_EQ(h.find_first_not_of('=', 3), Piece::npos);
+ ASSERT_EQ(h.find_first_not_of('\0'), 0U);
+ ASSERT_EQ(g.find_first_not_of('x'), 2U);
+ ASSERT_EQ(f.find_first_not_of('\0'), 0U);
+ ASSERT_EQ(f.find_first_not_of('\0', 3), 4U);
+ ASSERT_EQ(f.find_first_not_of('\0', 2), 2U);
+ // Empty haystacks (d, e): same result for the single-character overload.
+ ASSERT_EQ(d.find_first_not_of('x'), Piece::npos);
+ ASSERT_EQ(e.find_first_not_of('x'), Piece::npos);
+ ASSERT_EQ(d.find_first_not_of('\0'), Piece::npos);
+ ASSERT_EQ(e.find_first_not_of('\0'), Piece::npos);
+
+ // Reminder: g views "xx not found bb" and h views "====".
+ TypeParam fifty_six(TestFixture::as_string("56"));
+ Piece i(fifty_six);
+ ASSERT_EQ(h.find_last_of(a), Piece::npos);
+ ASSERT_EQ(g.find_last_of(a), g.size()-1);
+ ASSERT_EQ(a.find_last_of(b), 2U);
+ ASSERT_EQ(a.find_last_of(c), a.size()-1);
+ ASSERT_EQ(f.find_last_of(i), 6U);
+ ASSERT_EQ(a.find_last_of('a'), 0U);
+ ASSERT_EQ(a.find_last_of('b'), 1U);
+ ASSERT_EQ(a.find_last_of('z'), 25U);
+ ASSERT_EQ(a.find_last_of('a', 5), 0U);
+ ASSERT_EQ(a.find_last_of('b', 5), 1U);
+ ASSERT_EQ(a.find_last_of('b', 0), Piece::npos);
+ ASSERT_EQ(a.find_last_of('z', 25), 25U);
+ ASSERT_EQ(a.find_last_of('z', 24), Piece::npos);
+ ASSERT_EQ(f.find_last_of(i, 5), 5U);
+ ASSERT_EQ(f.find_last_of(i, 6), 6U);
+ ASSERT_EQ(f.find_last_of(a, 4), Piece::npos);
+ // An empty character set or an empty haystack never produces a match.
+ ASSERT_EQ(f.find_last_of(d), Piece::npos);
+ ASSERT_EQ(f.find_last_of(e), Piece::npos);
+ ASSERT_EQ(f.find_last_of(d, 4), Piece::npos);
+ ASSERT_EQ(f.find_last_of(e, 4), Piece::npos);
+ ASSERT_EQ(d.find_last_of(d), Piece::npos);
+ ASSERT_EQ(d.find_last_of(e), Piece::npos);
+ ASSERT_EQ(e.find_last_of(d), Piece::npos);
+ ASSERT_EQ(e.find_last_of(e), Piece::npos);
+ ASSERT_EQ(d.find_last_of(f), Piece::npos);
+ ASSERT_EQ(e.find_last_of(f), Piece::npos);
+ ASSERT_EQ(d.find_last_of(d, 4), Piece::npos);
+ ASSERT_EQ(d.find_last_of(e, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_of(d, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_of(e, 4), Piece::npos);
+ ASSERT_EQ(d.find_last_of(f, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_of(f, 4), Piece::npos);
+
+ ASSERT_EQ(a.find_last_not_of(b), a.size()-1);
+ ASSERT_EQ(a.find_last_not_of(c), 22U);
+ ASSERT_EQ(b.find_last_not_of(a), Piece::npos);
+ ASSERT_EQ(b.find_last_not_of(b), Piece::npos);
+ ASSERT_EQ(f.find_last_not_of(i), 4U);
+ ASSERT_EQ(a.find_last_not_of(c, 24), 22U);
+ ASSERT_EQ(a.find_last_not_of(b, 3), 3U);
+ ASSERT_EQ(a.find_last_not_of(b, 2), Piece::npos);
+ // Empty set: the last searched position qualifies. Empty haystack: npos.
+ ASSERT_EQ(f.find_last_not_of(d), f.size()-1);
+ ASSERT_EQ(f.find_last_not_of(e), f.size()-1);
+ ASSERT_EQ(f.find_last_not_of(d, 4), 4U);
+ ASSERT_EQ(f.find_last_not_of(e, 4), 4U);
+ ASSERT_EQ(d.find_last_not_of(d), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of(e), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(d), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(e), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of(f), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(f), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of(d, 4), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of(e, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(d, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(e, 4), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of(f, 4), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of(f, 4), Piece::npos);
+
+ ASSERT_EQ(h.find_last_not_of('x'), h.size() - 1);
+ ASSERT_EQ(h.find_last_not_of('='), Piece::npos);
+ ASSERT_EQ(b.find_last_not_of('c'), 1U);
+ ASSERT_EQ(h.find_last_not_of('x', 2), 2U);
+ ASSERT_EQ(h.find_last_not_of('=', 2), Piece::npos);
+ ASSERT_EQ(b.find_last_not_of('b', 1), 0U);
+ // Empty haystacks (d, e): same result for the single-character overload.
+ ASSERT_EQ(d.find_last_not_of('x'), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of('x'), Piece::npos);
+ ASSERT_EQ(d.find_last_not_of('\0'), Piece::npos);
+ ASSERT_EQ(e.find_last_not_of('\0'), Piece::npos);
+
+ ASSERT_EQ(a.substr(0, 3), b);
+ ASSERT_EQ(a.substr(23), c);
+ ASSERT_EQ(a.substr(23, 3), c);
+ ASSERT_EQ(a.substr(23, 99), c);
+ ASSERT_EQ(a.substr(0), a);
+ ASSERT_EQ(a.substr(3, 2), TestFixture::as_string("de"));
+ // A start past the end, or an empty source piece, gives an empty substring.
+ ASSERT_EQ(a.substr(99, 2), e);
+ ASSERT_EQ(d.substr(99), e);
+ ASSERT_EQ(d.substr(0, 99), e);
+ ASSERT_EQ(d.substr(99, 99), e);
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckCustom) {
+  using Piece = BasicStringPiece<TypeParam>;
+  TypeParam foobar(TestFixture::as_string("foobar"));
+  Piece a(foobar);
+  Piece e;
+
+  // remove_prefix: dropping 0 characters changes nothing and dropping size()
+  // characters leaves a piece equal to the empty one.
+  Piece c(a);
+  c.remove_prefix(3);
+  ASSERT_EQ(c, TestFixture::as_string("bar"));
+  c = a;
+  c.remove_prefix(0);
+  ASSERT_EQ(c, a);
+  c.remove_prefix(c.size());
+  ASSERT_EQ(c, e);
+
+  // remove_suffix: the same checks from the other end.
+  c = a;
+  c.remove_suffix(3);
+  ASSERT_EQ(c, TestFixture::as_string("foo"));
+  c = a;
+  c.remove_suffix(0);
+  ASSERT_EQ(c, a);
+  c.remove_suffix(c.size());
+  ASSERT_EQ(c, e);
+
+  // set: a length of 7 takes in the terminating NUL of |foobar|, so the piece
+  // no longer equals the 6-character |a|.
+  c.set(foobar.c_str());
+  ASSERT_EQ(c, a);
+  c.set(foobar.c_str(), 6);
+  ASSERT_EQ(c, a);
+  c.set(foobar.c_str(), 0);
+  ASSERT_EQ(c, e);
+  c.set(foobar.c_str(), 7);
+  ASSERT_NE(c, a);
+
+  // as_string: |c| still spans 7 characters here, so the expected string is
+  // built with the trailing NUL included.
+  TypeParam s3(a.as_string().c_str(), 7);
+  ASSERT_EQ(c, s3);
+  TypeParam s4(e.as_string());
+  ASSERT_TRUE(s4.empty());
+
+  // operator STRING_TYPE(): the same expectations when a piece is converted
+  // straight to its string type.
+  TypeParam s5(TypeParam(a).c_str(), 7);
+  ASSERT_EQ(c, s5);
+  TypeParam s6(e);
+  ASSERT_TRUE(s6.empty());
+}
+
+TEST(StringPieceTest, CheckCustom) {
+  StringPiece a("foobar");
+  std::string with_nul("123");
+  with_nul.push_back('\0');
+  with_nul.append("456");
+  StringPiece b(with_nul);
+  StringPiece e;
+  std::string out;
+
+  // CopyToString replaces whatever the target held before.
+  a.CopyToString(&out);
+  ASSERT_EQ(out.size(), 6U);
+  ASSERT_EQ(out, "foobar");
+  b.CopyToString(&out);
+  ASSERT_EQ(out.size(), 7U);
+  ASSERT_EQ(with_nul, out);
+  e.CopyToString(&out);
+  ASSERT_TRUE(out.empty());
+
+  // AppendToString adds to the end of the target instead.
+  out.clear();
+  a.AppendToString(&out);
+  ASSERT_EQ(out.size(), 6U);
+  ASSERT_EQ(out, "foobar");
+  a.AppendToString(&out);
+  ASSERT_EQ(out.size(), 12U);
+  ASSERT_EQ(out, "foobarfoobar");
+
+  // starts_with: the empty piece is a prefix of everything, itself included.
+  ASSERT_TRUE(a.starts_with(a));
+  ASSERT_TRUE(a.starts_with("foo"));
+  ASSERT_TRUE(a.starts_with(e));
+  ASSERT_TRUE(b.starts_with(with_nul));
+  ASSERT_TRUE(b.starts_with(b));
+  ASSERT_TRUE(b.starts_with(e));
+  ASSERT_TRUE(e.starts_with(""));
+  ASSERT_FALSE(a.starts_with(b));
+  ASSERT_FALSE(b.starts_with(a));
+  ASSERT_FALSE(e.starts_with(a));
+
+  // ends_with: the mirror image of the starts_with checks.
+  ASSERT_TRUE(a.ends_with(a));
+  ASSERT_TRUE(a.ends_with("bar"));
+  ASSERT_TRUE(a.ends_with(e));
+  ASSERT_TRUE(b.ends_with(with_nul));
+  ASSERT_TRUE(b.ends_with(b));
+  ASSERT_TRUE(b.ends_with(e));
+  ASSERT_TRUE(e.ends_with(""));
+  ASSERT_FALSE(a.ends_with(b));
+  ASSERT_FALSE(b.ends_with(a));
+  ASSERT_FALSE(e.ends_with(a));
+
+  StringPiece c;  // set() with explicit lengths 6, 0 and 7.
+  c.set("foobar", 6);
+  ASSERT_EQ(c, a);
+  c.set("foobar", 0);
+  ASSERT_EQ(c, e);
+  c.set("foobar", 7);
+  ASSERT_NE(c, a);
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckNULL) {
+ // Constructing from nullptr used to crash; it must now give an empty piece.
+ BasicStringPiece<TypeParam> s(nullptr);
+ ASSERT_EQ(s.data(), nullptr);
+ ASSERT_EQ(s.size(), 0U);
+
+ s.set(nullptr);  // Re-pointing at nullptr keeps the piece empty as well.
+ ASSERT_EQ(s.data(), nullptr);
+ ASSERT_EQ(s.size(), 0U);
+
+ TypeParam str(s);  // Converting the null piece gives an empty string.
+ ASSERT_EQ(str.length(), 0U);
+ ASSERT_EQ(str, TypeParam());
+
+ str = s.as_string();  // as_string() on the null piece is empty too.
+ ASSERT_EQ(str.length(), 0U);
+ ASSERT_EQ(str, TypeParam());
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckComparisons2) {
+  using Piece = BasicStringPiece<TypeParam>;
+  TypeParam alphabet(TestFixture::as_string("abcdefghijklmnopqrstuvwxyz"));
+  TypeParam alphabet_z(TestFixture::as_string("abcdefghijklmnopqrstuvwxyzz"));
+  TypeParam alphabet_y(TestFixture::as_string("abcdefghijklmnopqrstuvwxyy"));
+  Piece abc(alphabet);
+  // Equality on a long (26-character) input.
+  ASSERT_EQ(abc, Piece(alphabet));
+  ASSERT_EQ(abc.compare(Piece(alphabet)), 0);
+  // |alphabet_z| is the alphabet plus one extra 'z', so |abc| sorts before it.
+  ASSERT_TRUE(abc < Piece(alphabet_z));
+  ASSERT_LT(abc.compare(Piece(alphabet_z)), 0);
+  // |alphabet_y| ends in 'y' where the alphabet has 'z', so |abc| sorts after.
+  ASSERT_TRUE(abc > Piece(alphabet_y));
+  ASSERT_GT(abc.compare(Piece(alphabet_y)), 0);
+}
+
+// Test operations only supported by std::string version.
+TEST(StringPieceTest, CheckComparisons2) {
+  StringPiece alphabet("abcdefghijklmnopqrstuvwxyz");
+
+  // starts_with: the piece itself and a true prefix match, a near-miss fails.
+  ASSERT_TRUE(alphabet.starts_with(alphabet));
+  ASSERT_TRUE(alphabet.starts_with("abcdefghijklm"));
+  ASSERT_FALSE(alphabet.starts_with("abcdefguvwxyz"));
+
+  // ends_with: the piece itself and a true suffix match, a near-miss fails.
+  ASSERT_TRUE(alphabet.ends_with(alphabet));
+  ASSERT_FALSE(alphabet.ends_with("abcdefguvwxyz"));
+  ASSERT_TRUE(alphabet.ends_with("nopqrstuvwxyz"));
+}
+
+TYPED_TEST(CommonStringPieceTest, StringCompareNotAmbiguous) {
+ ASSERT_TRUE(TestFixture::as_string("hello").c_str() ==  // C string == string.
+ TestFixture::as_string("hello"));
+ ASSERT_TRUE(TestFixture::as_string("hello").c_str() <  // C string < string.
+ TestFixture::as_string("world"));
+}
+
+TYPED_TEST(CommonStringPieceTest, HeterogenousStringPieceEquals) {
+ TypeParam hello(TestFixture::as_string("hello"));
+
+ ASSERT_EQ(BasicStringPiece<TypeParam>(hello), hello);  // Piece vs. string.
+ ASSERT_EQ(hello.c_str(), BasicStringPiece<TypeParam>(hello));  // C string vs. piece.
+}
+
+// string16-specific stuff
+TEST(StringPiece16Test, CheckSTL) {
+ // After "123", append a NUL and the UTF-16 surrogate pair 0xd8c5 0xdffe.
+ string16 fifth(ASCIIToUTF16("123"));
+ fifth.push_back(0x0000);
+ fifth.push_back(0xd8c5);
+ fifth.push_back(0xdffe);
+ StringPiece16 f(fifth);
+
+ ASSERT_EQ(f[3], '\0');  // The embedded NUL is an ordinary element.
+ ASSERT_EQ(f[5], static_cast<char16>(0xdffe));  // Low surrogate, stored as is.
+
+ ASSERT_EQ(f.size(), 6U);  // 3 ASCII units + NUL + 2 surrogate units.
+}
+
+
+
+TEST(StringPiece16Test, CheckConversion) {
+ // Make sure that we can convert from UTF8 to UTF16 and back. We use G clef
+ // (U+1D11E), which takes four bytes in UTF-8, to test this.
+ ASSERT_EQ(
+ UTF16ToUTF8(
+ StringPiece16(UTF8ToUTF16("\xf0\x9d\x84\x9e")).as_string()),
+ "\xf0\x9d\x84\x9e");
+}
+
+TYPED_TEST(CommonStringPieceTest, CheckConstructors) {
+  using Piece = BasicStringPiece<TypeParam>;
+  using SizeType = typename Piece::size_type;
+
+  TypeParam str(TestFixture::as_string("hello world"));
+  TypeParam empty;
+
+  // From a string object and from a NUL-terminated pointer.
+  ASSERT_EQ(str, Piece(str));
+  ASSERT_EQ(str, Piece(str.c_str()));
+  // From a pointer plus an explicit length.
+  ASSERT_TRUE(TestFixture::as_string("hello") == Piece(str.c_str(), 5));
+  ASSERT_EQ(empty, Piece(str.c_str(), static_cast<SizeType>(0)));
+  // From nullptr, with and without an explicit length, and by default
+  // construction.
+  ASSERT_EQ(empty, Piece(nullptr));
+  ASSERT_TRUE(empty == Piece(nullptr, static_cast<SizeType>(0)));
+  ASSERT_EQ(empty, Piece());
+  // From iterator ranges, both non-empty and empty.
+  ASSERT_EQ(str, Piece(str.begin(), str.end()));
+  ASSERT_EQ(empty, Piece(str.begin(), str.begin()));
+  // From an empty string and from its (empty) iterator range.
+  ASSERT_EQ(empty, Piece(empty));
+  ASSERT_EQ(empty, Piece(empty.begin(), empty.end()));
+}
+
+TEST(StringPieceTest, ConstexprCtor) {
+ {
+ constexpr StringPiece piece;  // Default constructor.
+ std::ignore = piece;
+ }
+
+ {
+ constexpr StringPiece piece("abc");  // From a string literal.
+ std::ignore = piece;
+ }
+
+ {
+ constexpr StringPiece piece("abc", 2);  // From a pointer and a length.
+ std::ignore = piece;
+ }
+}
+
+TEST(StringPieceTest, OutOfBoundsDeath) {
+ {
+ constexpr StringPiece piece;
+ ASSERT_DEATH_IF_SUPPORTED(piece[0], "");  // Indexing an empty piece.
+ }
+
+ {
+ constexpr StringPiece piece;
+ ASSERT_DEATH_IF_SUPPORTED(piece.front(), "");  // front() of an empty piece.
+ }
+
+ {
+ constexpr StringPiece piece;
+ ASSERT_DEATH_IF_SUPPORTED(piece.back(), "");  // back() of an empty piece.
+ }
+
+ {
+ StringPiece piece;
+ ASSERT_DEATH_IF_SUPPORTED(piece.remove_suffix(1), "");  // Removes more than size().
+ }
+
+ {
+ StringPiece piece;
+ ASSERT_DEATH_IF_SUPPORTED(piece.remove_prefix(1), "");  // Removes more than size().
+ }
+}
+
+TEST(StringPieceTest, ConstexprData) {
+ {
+ constexpr StringPiece piece;
+ static_assert(piece.data() == nullptr, "");  // Default piece has null data.
+ }
+
+ {
+ constexpr StringPiece piece("abc");  // Whole literal.
+ static_assert(piece.data()[0] == 'a', "");
+ static_assert(piece.data()[1] == 'b', "");
+ static_assert(piece.data()[2] == 'c', "");
+ }
+
+ {
+ constexpr StringPiece piece("def", 2);  // First two characters only.
+ static_assert(piece.data()[0] == 'd', "");
+ static_assert(piece.data()[1] == 'e', "");
+ }
+}
+
+TEST(StringPieceTest, ConstexprSize) {
+ {
+ constexpr StringPiece piece;
+ static_assert(piece.size() == 0, "");  // Default piece is empty.
+ }
+
+ {
+ constexpr StringPiece piece("abc");
+ static_assert(piece.size() == 3, "");  // Length of the literal, NUL excluded.
+ }
+
+ {
+ constexpr StringPiece piece("def", 2);
+ static_assert(piece.size() == 2, "");  // The explicit length wins.
+ }
+}
+
+TEST(StringPieceTest, Compare) {
+ constexpr StringPiece piece = "def";
+
+ static_assert(piece.compare("ab") == 1, "");  // Arguments sorting before "def".
+ static_assert(piece.compare("abc") == 1, "");
+ static_assert(piece.compare("abcd") == 1, "");
+ static_assert(piece.compare("de") == 1, "");  // A proper prefix of "def".
+ static_assert(piece.compare("def") == 0, "");  // Identical contents.
+ static_assert(piece.compare("defg") == -1, "");  // "def" is its proper prefix.
+ static_assert(piece.compare("gh") == -1, "");  // Arguments sorting after "def".
+ static_assert(piece.compare("ghi") == -1, "");
+ static_assert(piece.compare("ghij") == -1, "");
+}
+
+TEST(StringPieceTest, StartsWith) {
+ constexpr StringPiece piece("abc");
+
+ static_assert(piece.starts_with(""), "");  // The empty prefix always matches.
+ static_assert(piece.starts_with("a"), "");
+ static_assert(piece.starts_with("ab"), "");
+ static_assert(piece.starts_with("abc"), "");  // The whole piece.
+
+ static_assert(!piece.starts_with("b"), "");  // Substrings not at the start.
+ static_assert(!piece.starts_with("bc"), "");
+
+ static_assert(!piece.starts_with("abcd"), "");  // Longer than the piece.
+}
+
+TEST(StringPieceTest, EndsWith) {
+ constexpr StringPiece piece("abc");
+
+ static_assert(piece.ends_with(""), "");  // The empty suffix always matches.
+ static_assert(piece.ends_with("c"), "");
+ static_assert(piece.ends_with("bc"), "");
+ static_assert(piece.ends_with("abc"), "");  // The whole piece.
+
+ static_assert(!piece.ends_with("a"), "");  // Substrings not at the end.
+ static_assert(!piece.ends_with("ab"), "");
+
+ static_assert(!piece.ends_with("abcd"), "");  // Longer than the piece.
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_split.cc b/base/strings/string_split.cc
new file mode 100644
index 0000000..ef9c74d
--- /dev/null
+++ b/base/strings/string_split.cc
@@ -0,0 +1,277 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_split.h"
+
+#include <stddef.h>
+
+#include "polyfills/base/logging.h"
+#include "base/strings/string_util.h"
+#include "base/third_party/icu/icu_utf.h"
+
+namespace gurl_base {
+
+namespace {
+
+// PieceToOutputType converts a StringPiece as needed to a given output type,
+// which is either the same type of StringPiece (a NOP) or the corresponding
+// non-piece string type.
+//
+// The default converter is a NOP, it works when the OutputType is the
+// correct StringPiece.
+template<typename Str, typename OutputType>
+OutputType PieceToOutputType(BasicStringPiece<Str> piece) {
+ return piece;  // Default case: the piece itself is converted to OutputType.
+}
+template<> // StringPiece -> std::string, through as_string().
+std::string PieceToOutputType<std::string, std::string>(StringPiece piece) {
+ return piece.as_string();
+}
+template<> // StringPiece16 -> string16, through as_string().
+string16 PieceToOutputType<string16, string16>(StringPiece16 piece) {
+ return piece.as_string();
+}
+
+// Returns either the ASCII or UTF-16 whitespace.
+template<typename Str> BasicStringPiece<Str> WhitespaceForType();  // Declaration only.
+template<> StringPiece16 WhitespaceForType<string16>() {
+ return kWhitespaceUTF16;  // Whitespace set for 16-bit strings.
+}
+template<> StringPiece WhitespaceForType<std::string>() {
+ return kWhitespaceASCII;  // Whitespace set for 8-bit strings.
+}
+
+// Optimize the single-character case to call find() on the string instead,
+// since this is the common case and can be made faster. This could have been
+// done with template specialization too, but would have been less clear.
+//
+// There is no corresponding FindFirstNotOf because StringPiece already
+// implements these different versions that do the optimized searching.
+size_t FindFirstOf(StringPiece piece, char c, size_t pos) {
+ return piece.find(c, pos);  // One delimiter character: plain find().
+}
+size_t FindFirstOf(StringPiece16 piece, char16 c, size_t pos) {
+ return piece.find(c, pos);  // 16-bit flavor of the single-character case.
+}
+size_t FindFirstOf(StringPiece piece, StringPiece one_of, size_t pos) {
+ return piece.find_first_of(one_of, pos);  // Any character out of |one_of|.
+}
+size_t FindFirstOf(StringPiece16 piece, StringPiece16 one_of, size_t pos) {
+ return piece.find_first_of(one_of, pos);  // 16-bit flavor of the set case.
+}
+
+// General string splitter template. Can take 8- or 16-bit input, can produce
+// the corresponding string or StringPiece output, and can take single- or
+// multiple-character delimiters.
+//
+// DelimiterType is either a character (Str::value_type) or a string piece of
+// multiple characters (BasicStringPiece<Str>). StringPiece has a version of
+// find for both of these cases, and the single-character version is the most
+// common and can be implemented faster, which is why this is a template.
+template <typename Str, typename OutputStringType, typename DelimiterType>
+static std::vector<OutputStringType> SplitStringT(
+    BasicStringPiece<Str> str,
+    DelimiterType delimiter,
+    WhitespaceHandling whitespace,
+    SplitResult result_type) {
+  std::vector<OutputStringType> tokens;
+  // Empty input produces no tokens at all, not one empty token.
+  if (str.empty())
+    return tokens;
+
+  const bool trim = whitespace == TRIM_WHITESPACE;
+  const bool keep_empty = result_type == SPLIT_WANT_ALL;
+  for (size_t token_begin = 0;;) {
+    // Each token runs up to the next delimiter, or to the end of |str|.
+    const size_t delimiter_pos = FindFirstOf(str, delimiter, token_begin);
+    const bool is_last = delimiter_pos == Str::npos;
+    BasicStringPiece<Str> token =
+        is_last ? str.substr(token_begin)
+                : str.substr(token_begin, delimiter_pos - token_begin);
+    if (trim)
+      token = TrimString(token, WhitespaceForType<Str>(), TRIM_ALL);
+    if (keep_empty || !token.empty())
+      tokens.push_back(PieceToOutputType<Str, OutputStringType>(token));
+
+    if (is_last)
+      break;
+    token_begin = delimiter_pos + 1;  // Resume just past the delimiter.
+  }
+  return tokens;
+}
+
+bool AppendStringKeyValue(StringPiece input,
+                          char delimiter,
+                          StringPairs* result) {
+  // A new pair is appended up front and stays in |result| even when parsing
+  // fails (it may then be empty or half-filled); the strings are copied into it.
+  result->resize(result->size() + 1);
+  auto& entry = result->back();
+
+  // The key is everything before the first delimiter.
+  const size_t key_end = input.find_first_of(delimiter);
+  if (key_end == StringPiece::npos) {
+    DVLOG(1) << "cannot find delimiter in: " << input;
+    return false;  // No delimiter.
+  }
+  input.substr(0, key_end).CopyToString(&entry.first);
+
+  // The value starts at the first non-delimiter character after the key, so
+  // a run of repeated delimiters is skipped as a whole.
+  StringPiece rest = input.substr(key_end);
+  const size_t value_begin = rest.find_first_not_of(delimiter);
+  if (value_begin == StringPiece::npos) {
+    DVLOG(1) << "cannot parse value from input: " << input;
+    return false;  // No value.
+  }
+  rest.substr(value_begin).CopyToString(&entry.second);
+
+  return true;
+}
+
+// Splits |input| at each occurrence of the whole |delimiter| into |result|.
+template <typename Str, typename OutputStringType>
+void SplitStringUsingSubstrT(BasicStringPiece<Str> input,
+                             BasicStringPiece<Str> delimiter,
+                             WhitespaceHandling whitespace,
+                             SplitResult result_type,
+                             std::vector<OutputStringType>* result) {
+  using Piece = BasicStringPiece<Str>;
+  using size_type = typename Piece::size_type;
+  result->clear();
+  for (size_type begin_index = 0, end_index = 0; end_index != Piece::npos;
+       begin_index = end_index + delimiter.size()) {
+    // An empty delimiter would match forever without advancing: take one term.
+    end_index =
+        delimiter.empty() ? Piece::npos : input.find(delimiter, begin_index);
+    Piece term = end_index == Piece::npos
+                     ? input.substr(begin_index)
+                     : input.substr(begin_index, end_index - begin_index);
+    if (whitespace == TRIM_WHITESPACE)
+      term = TrimString(term, WhitespaceForType<Str>(), TRIM_ALL);
+    if (result_type == SPLIT_WANT_ALL || !term.empty())
+      result->push_back(PieceToOutputType<Str, OutputStringType>(term));
+  }
+}
+
+} // namespace
+
+std::vector<std::string> SplitString(StringPiece input,
+ StringPiece separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ if (separators.size() == 1) {
+ return SplitStringT<std::string, std::string, char>(
+ input, separators[0], whitespace, result_type);
+ }
+ return SplitStringT<std::string, std::string, StringPiece>(
+ input, separators, whitespace, result_type);
+}
+
+std::vector<string16> SplitString(StringPiece16 input,
+ StringPiece16 separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ if (separators.size() == 1) {
+ return SplitStringT<string16, string16, char16>(
+ input, separators[0], whitespace, result_type);
+ }
+ return SplitStringT<string16, string16, StringPiece16>(
+ input, separators, whitespace, result_type);
+}
+
+std::vector<StringPiece> SplitStringPiece(StringPiece input,
+ StringPiece separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ if (separators.size() == 1) {
+ return SplitStringT<std::string, StringPiece, char>(
+ input, separators[0], whitespace, result_type);
+ }
+ return SplitStringT<std::string, StringPiece, StringPiece>(
+ input, separators, whitespace, result_type);
+}
+
+std::vector<StringPiece16> SplitStringPiece(StringPiece16 input,
+ StringPiece16 separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ if (separators.size() == 1) {
+ return SplitStringT<string16, StringPiece16, char16>(
+ input, separators[0], whitespace, result_type);
+ }
+ return SplitStringT<string16, StringPiece16, StringPiece16>(
+ input, separators, whitespace, result_type);
+}
+
+bool SplitStringIntoKeyValuePairs(StringPiece input,
+ char key_value_delimiter,
+ char key_value_pair_delimiter,
+ StringPairs* key_value_pairs) {
+ return SplitStringIntoKeyValuePairsUsingSubstr(
+ input, key_value_delimiter, StringPiece(&key_value_pair_delimiter, 1),
+ key_value_pairs);
+}
+
+bool SplitStringIntoKeyValuePairsUsingSubstr(
+ StringPiece input,
+ char key_value_delimiter,
+ StringPiece key_value_pair_delimiter,
+ StringPairs* key_value_pairs) {
+ key_value_pairs->clear();
+
+ std::vector<StringPiece> pairs = SplitStringPieceUsingSubstr(
+ input, key_value_pair_delimiter, TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);
+ key_value_pairs->reserve(pairs.size());
+
+ bool success = true;
+ for (const StringPiece& pair : pairs) {
+ if (!AppendStringKeyValue(pair, key_value_delimiter, key_value_pairs)) {
+ // Don't return here, to allow for pairs without associated
+ // value or key; just record that the split failed.
+ success = false;
+ }
+ }
+ return success;
+}
+
+std::vector<string16> SplitStringUsingSubstr(StringPiece16 input,
+ StringPiece16 delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ std::vector<string16> result;
+ SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
+ return result;
+}
+
+std::vector<std::string> SplitStringUsingSubstr(StringPiece input,
+ StringPiece delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ std::vector<std::string> result;
+ SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
+ return result;
+}
+
+std::vector<StringPiece16> SplitStringPieceUsingSubstr(
+ StringPiece16 input,
+ StringPiece16 delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ std::vector<StringPiece16> result;
+ SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
+ return result;
+}
+
+std::vector<StringPiece> SplitStringPieceUsingSubstr(
+ StringPiece input,
+ StringPiece delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type) {
+ std::vector<StringPiece> result;
+ SplitStringUsingSubstrT(input, delimiter, whitespace, result_type, &result);
+ return result;
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_split.h b/base/strings/string_split.h
new file mode 100644
index 0000000..1894d05
--- /dev/null
+++ b/base/strings/string_split.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING_SPLIT_H_
+#define BASE_STRINGS_STRING_SPLIT_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+enum WhitespaceHandling {
+ KEEP_WHITESPACE,
+ TRIM_WHITESPACE,
+};
+
+enum SplitResult {
+ // Strictly return all results.
+ //
+ // If the input is ",," and the separator is ',' this will return a
+ // vector of three empty strings.
+ SPLIT_WANT_ALL,
+
+ // Only nonempty results will be added to the results. Multiple separators
+ // will be coalesced. Separators at the beginning and end of the input will
+ // be ignored. With TRIM_WHITESPACE, whitespace-only results will be dropped.
+ //
+ // If the input is ",," and the separator is ',', this will return an empty
+ // vector.
+ SPLIT_WANT_NONEMPTY,
+};
+
+// Split the given string on ANY of the given separators, returning copies of
+// the result.
+//
+// To split on either commas or semicolons, keeping all whitespace:
+//
+// std::vector<std::string> tokens = gurl_base::SplitString(
+// input, ",;", gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+BASE_EXPORT std::vector<std::string> SplitString(
+ StringPiece input,
+ StringPiece separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+BASE_EXPORT std::vector<string16> SplitString(
+ StringPiece16 input,
+ StringPiece16 separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+
+// Like SplitString above except it returns a vector of StringPieces which
+// reference the original buffer without copying. Although you have to be
+// careful to keep the original string unmodified, this provides an efficient
+// way to iterate through tokens in a string.
+//
+// To iterate through all whitespace-separated tokens in an input string:
+//
+// for (const auto& cur :
+// gurl_base::SplitStringPiece(input, gurl_base::kWhitespaceASCII,
+// gurl_base::KEEP_WHITESPACE,
+// gurl_base::SPLIT_WANT_NONEMPTY)) {
+// ...
+BASE_EXPORT std::vector<StringPiece> SplitStringPiece(
+ StringPiece input,
+ StringPiece separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+BASE_EXPORT std::vector<StringPiece16> SplitStringPiece(
+ StringPiece16 input,
+ StringPiece16 separators,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+
+using StringPairs = std::vector<std::pair<std::string, std::string>>;
+
+// Splits |input| into key value pairs according to the given delimiters and
+// removes whitespace leading each key and trailing each value. Returns true
+// only if each pair has |key_value_delimiter| and a non-empty value; keys may
+// be empty. Entries without |key_value_delimiter| are added as ("","") pairs.
+BASE_EXPORT bool SplitStringIntoKeyValuePairs(StringPiece input,
+ char key_value_delimiter,
+ char key_value_pair_delimiter,
+ StringPairs* key_value_pairs);
+
+// Similar to SplitStringIntoKeyValuePairs, but use a substring
+// |key_value_pair_delimiter| instead of a single char.
+BASE_EXPORT bool SplitStringIntoKeyValuePairsUsingSubstr(
+ StringPiece input,
+ char key_value_delimiter,
+ StringPiece key_value_pair_delimiter,
+ StringPairs* key_value_pairs);
+
+// Similar to SplitString, but use a substring delimiter instead of a list of
+// characters that are all possible delimiters.
+BASE_EXPORT std::vector<string16> SplitStringUsingSubstr(
+ StringPiece16 input,
+ StringPiece16 delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+BASE_EXPORT std::vector<std::string> SplitStringUsingSubstr(
+ StringPiece input,
+ StringPiece delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+
+// Like SplitStringUsingSubstr above except it returns a vector of StringPieces
+// which reference the original buffer without copying. Although you have to be
+// careful to keep the original string unmodified, this provides an efficient
+// way to iterate through tokens in a string.
+//
+// To iterate through all newline-separated tokens in an input string:
+//
+// for (const auto& cur :
+// gurl_base::SplitStringPieceUsingSubstr(input, "\r\n",
+// gurl_base::KEEP_WHITESPACE,
+// gurl_base::SPLIT_WANT_NONEMPTY)) {
+// ...
+BASE_EXPORT std::vector<StringPiece16> SplitStringPieceUsingSubstr(
+ StringPiece16 input,
+ StringPiece16 delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+BASE_EXPORT std::vector<StringPiece> SplitStringPieceUsingSubstr(
+ StringPiece input,
+ StringPiece delimiter,
+ WhitespaceHandling whitespace,
+ SplitResult result_type);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_SPLIT_H_
diff --git a/base/strings/string_split_unittest.cc b/base/strings/string_split_unittest.cc
new file mode 100644
index 0000000..993450a
--- /dev/null
+++ b/base/strings/string_split_unittest.cc
@@ -0,0 +1,448 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_split.h"
+
+#include <stddef.h>
+
+#include "base/macros.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using ::testing::ElementsAre;
+
+namespace gurl_base {
+
+class SplitStringIntoKeyValuePairsTest : public testing::Test {
+ protected:
+ gurl_base::StringPairs kv_pairs;
+};
+
+using SplitStringIntoKeyValuePairsUsingSubstrTest =
+ SplitStringIntoKeyValuePairsTest;
+
+TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, EmptyString) {
+ EXPECT_TRUE(
+ SplitStringIntoKeyValuePairsUsingSubstr(std::string(),
+ ':', // Key-value delimiter
+ ",", // Key-value pair delimiter
+ &kv_pairs));
+ EXPECT_TRUE(kv_pairs.empty());
+}
+
+TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, MissingKeyValueDelimiter) {
+ EXPECT_FALSE(
+ SplitStringIntoKeyValuePairsUsingSubstr("key1,,key2:value2",
+ ':', // Key-value delimiter
+ ",,", // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_TRUE(kv_pairs[0].first.empty());
+ EXPECT_TRUE(kv_pairs[0].second.empty());
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest,
+ MissingKeyValuePairDelimeter) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairsUsingSubstr(
+ "key1:value1,,key3:value3",
+ ':', // Key-value delimiter
+ ",,,", // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(1U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("value1,,key3:value3", kv_pairs[0].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, UntrimmedWhitespace) {
+ EXPECT_TRUE(
+ SplitStringIntoKeyValuePairsUsingSubstr("key1 : value1",
+ ':', // Key-value delimiter
+ ",", // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(1U, kv_pairs.size());
+ EXPECT_EQ("key1 ", kv_pairs[0].first);
+ EXPECT_EQ(" value1", kv_pairs[0].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsUsingSubstrTest, OnlySplitAtGivenSeparator) {
+ std::string a("a ?!@#$%^&*()_+:/{}\\\t\nb");
+ EXPECT_TRUE(
+ SplitStringIntoKeyValuePairsUsingSubstr(a + "X" + a + "XY" + a + "YX" + a,
+ 'X', // Key-value delimiter
+ "XY", // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ(a, kv_pairs[0].first);
+ EXPECT_EQ(a, kv_pairs[0].second);
+ EXPECT_EQ(a + 'Y', kv_pairs[1].first);
+ EXPECT_EQ(a, kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, EmptyString) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs(std::string(),
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ EXPECT_TRUE(kv_pairs.empty());
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, MissingKeyValueDelimiter) {
+ EXPECT_FALSE(SplitStringIntoKeyValuePairs("key1,key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_TRUE(kv_pairs[0].first.empty());
+ EXPECT_TRUE(kv_pairs[0].second.empty());
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, EmptyKeyWithKeyValueDelimiter) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs(":value1,key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_TRUE(kv_pairs[0].first.empty());
+ EXPECT_EQ("value1", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, TrailingAndLeadingPairDelimiter) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs(",key1:value1,key2:value2,",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("value1", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, EmptyPair) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:value1,,key3:value3",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("value1", kv_pairs[0].second);
+ EXPECT_EQ("key3", kv_pairs[1].first);
+ EXPECT_EQ("value3", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, EmptyValue) {
+ EXPECT_FALSE(SplitStringIntoKeyValuePairs("key1:,key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, UntrimmedWhitespace) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1 : value1",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(1U, kv_pairs.size());
+ EXPECT_EQ("key1 ", kv_pairs[0].first);
+ EXPECT_EQ(" value1", kv_pairs[0].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, TrimmedWhitespace) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:value1 , key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("value1", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, MultipleKeyValueDelimiters) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:::value1,key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("value1", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST_F(SplitStringIntoKeyValuePairsTest, OnlySplitAtGivenSeparator) {
+ std::string a("a ?!@#$%^&*()_+:/{}\\\t\nb");
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs(a + "X" + a + "Y" + a + "X" + a,
+ 'X', // Key-value delimiter
+ 'Y', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ(a, kv_pairs[0].first);
+ EXPECT_EQ(a, kv_pairs[0].second);
+ EXPECT_EQ(a, kv_pairs[1].first);
+ EXPECT_EQ(a, kv_pairs[1].second);
+}
+
+
+TEST_F(SplitStringIntoKeyValuePairsTest, DelimiterInValue) {
+ EXPECT_TRUE(SplitStringIntoKeyValuePairs("key1:va:ue1,key2:value2",
+ ':', // Key-value delimiter
+ ',', // Key-value pair delimiter
+ &kv_pairs));
+ ASSERT_EQ(2U, kv_pairs.size());
+ EXPECT_EQ("key1", kv_pairs[0].first);
+ EXPECT_EQ("va:ue1", kv_pairs[0].second);
+ EXPECT_EQ("key2", kv_pairs[1].first);
+ EXPECT_EQ("value2", kv_pairs[1].second);
+}
+
+TEST(SplitStringUsingSubstrTest, EmptyString) {
+ std::vector<std::string> results = SplitStringUsingSubstr(
+ std::string(), "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(1u, results.size());
+ EXPECT_THAT(results, ElementsAre(""));
+}
+
+TEST(StringUtilTest, SplitString_Basics) {
+ std::vector<std::string> r;
+
+ r = SplitString(std::string(), ",:;", KEEP_WHITESPACE, SPLIT_WANT_ALL);
+ EXPECT_TRUE(r.empty());
+
+ // Empty separator list
+ r = SplitString("hello, world", "", KEEP_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(1u, r.size());
+ EXPECT_EQ("hello, world", r[0]);
+
+ // Should split on any of the separators.
+ r = SplitString("::,,;;", ",:;", KEEP_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(7u, r.size());
+ for (auto str : r)
+ ASSERT_TRUE(str.empty());
+
+ r = SplitString("red, green; blue:", ",:;", TRIM_WHITESPACE,
+ SPLIT_WANT_NONEMPTY);
+ ASSERT_EQ(3u, r.size());
+ EXPECT_EQ("red", r[0]);
+ EXPECT_EQ("green", r[1]);
+ EXPECT_EQ("blue", r[2]);
+
+ // Want to split a string along whitespace sequences.
+ r = SplitString(" red green \tblue\n", " \t\n", TRIM_WHITESPACE,
+ SPLIT_WANT_NONEMPTY);
+ ASSERT_EQ(3u, r.size());
+ EXPECT_EQ("red", r[0]);
+ EXPECT_EQ("green", r[1]);
+ EXPECT_EQ("blue", r[2]);
+
+ // Weird case of splitting on spaces and also trimming; empty pieces are kept.
+ r = SplitString(" red ", " ", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(3u, r.size());
+ EXPECT_EQ("", r[0]); // Before the first space.
+ EXPECT_EQ("red", r[1]);
+ EXPECT_EQ("", r[2]); // After the last space.
+}
+
+TEST(StringUtilTest, SplitString_WhitespaceAndResultType) {
+ std::vector<std::string> r;
+
+ // Empty input handling.
+ r = SplitString(std::string(), ",", KEEP_WHITESPACE, SPLIT_WANT_ALL);
+ EXPECT_TRUE(r.empty());
+ r = SplitString(std::string(), ",", KEEP_WHITESPACE, SPLIT_WANT_NONEMPTY);
+ EXPECT_TRUE(r.empty());
+
+ // Input string is space and we're trimming.
+ r = SplitString(" ", ",", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(1u, r.size());
+ EXPECT_EQ("", r[0]);
+ r = SplitString(" ", ",", TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);
+ EXPECT_TRUE(r.empty());
+
+ // Test all 4 combinations of flags on ", ,".
+ r = SplitString(", ,", ",", KEEP_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(3u, r.size());
+ EXPECT_EQ("", r[0]);
+ EXPECT_EQ(" ", r[1]);
+ EXPECT_EQ("", r[2]);
+ r = SplitString(", ,", ",", KEEP_WHITESPACE, SPLIT_WANT_NONEMPTY);
+ ASSERT_EQ(1u, r.size());
+ ASSERT_EQ(" ", r[0]);
+ r = SplitString(", ,", ",", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(3u, r.size());
+ EXPECT_EQ("", r[0]);
+ EXPECT_EQ("", r[1]);
+ EXPECT_EQ("", r[2]);
+ r = SplitString(", ,", ",", TRIM_WHITESPACE, SPLIT_WANT_NONEMPTY);
+ ASSERT_TRUE(r.empty());
+}
+
+TEST(SplitStringUsingSubstrTest, StringWithNoDelimiter) {
+ std::vector<std::string> results = SplitStringUsingSubstr(
+ "alongwordwithnodelimiter", "DELIMITER", TRIM_WHITESPACE,
+ SPLIT_WANT_ALL);
+ ASSERT_EQ(1u, results.size());
+ EXPECT_THAT(results, ElementsAre("alongwordwithnodelimiter"));
+}
+
+TEST(SplitStringUsingSubstrTest, LeadingDelimitersSkipped) {
+ std::vector<std::string> results = SplitStringUsingSubstr(
+ "DELIMITERDELIMITERDELIMITERoneDELIMITERtwoDELIMITERthree",
+ "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(6u, results.size());
+ EXPECT_THAT(results, ElementsAre("", "", "", "one", "two", "three"));
+}
+
+TEST(SplitStringUsingSubstrTest, ConsecutiveDelimitersSkipped) {
+ std::vector<std::string> results = SplitStringUsingSubstr(
+ "unoDELIMITERDELIMITERDELIMITERdosDELIMITERtresDELIMITERDELIMITERcuatro",
+ "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(7u, results.size());
+ EXPECT_THAT(results, ElementsAre("uno", "", "", "dos", "tres", "", "cuatro"));
+}
+
+TEST(SplitStringUsingSubstrTest, TrailingDelimitersSkipped) {
+ std::vector<std::string> results = SplitStringUsingSubstr(
+ "unDELIMITERdeuxDELIMITERtroisDELIMITERquatreDELIMITERDELIMITERDELIMITER",
+ "DELIMITER", TRIM_WHITESPACE, SPLIT_WANT_ALL);
+ ASSERT_EQ(7u, results.size());
+ EXPECT_THAT(
+ results, ElementsAre("un", "deux", "trois", "quatre", "", "", ""));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, StringWithNoDelimiter) {
+ std::vector<gurl_base::StringPiece> results =
+ SplitStringPieceUsingSubstr("alongwordwithnodelimiter", "DELIMITER",
+ gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(1u, results.size());
+ EXPECT_THAT(results, ElementsAre("alongwordwithnodelimiter"));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, LeadingDelimitersSkipped) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "DELIMITERDELIMITERDELIMITERoneDELIMITERtwoDELIMITERthree", "DELIMITER",
+ gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(6u, results.size());
+ EXPECT_THAT(results, ElementsAre("", "", "", "one", "two", "three"));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, ConsecutiveDelimitersSkipped) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "unoDELIMITERDELIMITERDELIMITERdosDELIMITERtresDELIMITERDELIMITERcuatro",
+ "DELIMITER", gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(7u, results.size());
+ EXPECT_THAT(results, ElementsAre("uno", "", "", "dos", "tres", "", "cuatro"));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, TrailingDelimitersSkipped) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "unDELIMITERdeuxDELIMITERtroisDELIMITERquatreDELIMITERDELIMITERDELIMITER",
+ "DELIMITER", gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(7u, results.size());
+ EXPECT_THAT(results,
+ ElementsAre("un", "deux", "trois", "quatre", "", "", ""));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, KeepWhitespace) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "un DELIMITERdeux\tDELIMITERtrois\nDELIMITERquatre", "DELIMITER",
+ gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(4u, results.size());
+ EXPECT_THAT(results, ElementsAre("un ", "deux\t", "trois\n", "quatre"));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, TrimWhitespace) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "un DELIMITERdeux\tDELIMITERtrois\nDELIMITERquatre", "DELIMITER",
+ gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(4u, results.size());
+ EXPECT_THAT(results, ElementsAre("un", "deux", "trois", "quatre"));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, SplitWantAll) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "unDELIMITERdeuxDELIMITERtroisDELIMITERDELIMITER", "DELIMITER",
+ gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(5u, results.size());
+ EXPECT_THAT(results, ElementsAre("un", "deux", "trois", "", ""));
+}
+
+TEST(SplitStringPieceUsingSubstrTest, SplitWantNonEmpty) {
+ std::vector<gurl_base::StringPiece> results = SplitStringPieceUsingSubstr(
+ "unDELIMITERdeuxDELIMITERtroisDELIMITERDELIMITER", "DELIMITER",
+ gurl_base::TRIM_WHITESPACE, gurl_base::SPLIT_WANT_NONEMPTY);
+ ASSERT_EQ(3u, results.size());
+ EXPECT_THAT(results, ElementsAre("un", "deux", "trois"));
+}
+
+TEST(StringSplitTest, StringSplitKeepWhitespace) {
+ std::vector<std::string> r;
+
+ r = SplitString(" ", "*", gurl_base::KEEP_WHITESPACE, gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(1U, r.size());
+ EXPECT_EQ(r[0], " ");
+
+ r = SplitString("\t \ta\t ", "\t", gurl_base::KEEP_WHITESPACE,
+ gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(4U, r.size());
+ EXPECT_EQ(r[0], "");
+ EXPECT_EQ(r[1], " ");
+ EXPECT_EQ(r[2], "a");
+ EXPECT_EQ(r[3], " ");
+
+ r = SplitString("\ta\t\nb\tcc", "\n", gurl_base::KEEP_WHITESPACE,
+ gurl_base::SPLIT_WANT_ALL);
+ ASSERT_EQ(2U, r.size());
+ EXPECT_EQ(r[0], "\ta\t");
+ EXPECT_EQ(r[1], "b\tcc");
+}
+
+TEST(StringSplitTest, SplitStringAlongWhitespace) {
+ struct TestData {
+ const char* input;
+ const size_t expected_result_count;
+ const char* output1;
+ const char* output2;
+ } data[] = {
+ { "a", 1, "a", "" },
+ { " ", 0, "", "" },
+ { " a", 1, "a", "" },
+ { " ab ", 1, "ab", "" },
+ { " ab c", 2, "ab", "c" },
+ { " ab c ", 2, "ab", "c" },
+ { " ab cd", 2, "ab", "cd" },
+ { " ab cd ", 2, "ab", "cd" },
+ { " \ta\t", 1, "a", "" },
+ { " b\ta\t", 2, "b", "a" },
+ { " b\tat", 2, "b", "at" },
+ { "b\tat", 2, "b", "at" },
+ { "b\t at", 2, "b", "at" },
+ };
+ for (const auto& i : data) {
+ std::vector<std::string> results =
+ gurl_base::SplitString(i.input, kWhitespaceASCII, gurl_base::KEEP_WHITESPACE,
+ gurl_base::SPLIT_WANT_NONEMPTY);
+ ASSERT_EQ(i.expected_result_count, results.size());
+ if (i.expected_result_count > 0)
+ ASSERT_EQ(i.output1, results[0]);
+ if (i.expected_result_count > 1)
+ ASSERT_EQ(i.output2, results[1]);
+ }
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_tokenizer.h b/base/strings/string_tokenizer.h
new file mode 100644
index 0000000..7ee0178
--- /dev/null
+++ b/base/strings/string_tokenizer.h
@@ -0,0 +1,303 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING_TOKENIZER_H_
+#define BASE_STRINGS_STRING_TOKENIZER_H_
+
+#include <algorithm>
+#include <string>
+
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// StringTokenizerT is a simple string tokenizer class. It works like an
+// iterator that with each step (see the GetNext method) updates members that
+// refer to the next token in the input string. The user may optionally
+// configure the tokenizer to return delimiters.
+//
+// EXAMPLE 1:
+//
+// char input[] = "this is a test";
+// CStringTokenizer t(input, input + strlen(input), " ");
+// while (t.GetNext()) {
+// printf("%s\n", t.token().c_str());
+// }
+//
+// Output:
+//
+// this
+// is
+// a
+// test
+//
+//
+// EXAMPLE 2:
+//
+// std::string input = "no-cache=\"foo, bar\", private";
+// StringTokenizer t(input, ", ");
+// t.set_quote_chars("\"");
+// while (t.GetNext()) {
+// printf("%s\n", t.token().c_str());
+// }
+//
+// Output:
+//
+// no-cache="foo, bar"
+// private
+//
+//
+// EXAMPLE 3:
+//
+// bool next_is_option = false, next_is_value = false;
+// std::string input = "text/html; charset=UTF-8; foo=bar";
+// StringTokenizer t(input, "; =");
+// t.set_options(StringTokenizer::RETURN_DELIMS);
+// while (t.GetNext()) {
+// if (t.token_is_delim()) {
+// switch (*t.token_begin()) {
+// case ';':
+// next_is_option = true;
+// break;
+// case '=':
+// next_is_value = true;
+// break;
+// }
+// } else {
+// const char* label;
+// if (next_is_option) {
+// label = "option-name";
+// next_is_option = false;
+// } else if (next_is_value) {
+// label = "option-value";
+// next_is_value = false;
+// } else {
+// label = "mime-type";
+// }
+// printf("%s: %s\n", label, t.token().c_str());
+// }
+// }
+//
+//
+template <class str, class const_iterator>
+class StringTokenizerT {
+ public:
+ typedef typename str::value_type char_type;
+
+ // Options that may be passed to set_options()
+ enum {
+ // Specifies the delimiters should be returned as tokens
+ RETURN_DELIMS = 1 << 0,
+
+ // Specifies that empty tokens should be returned. Treats the beginning and
+ // ending of the string as implicit delimiters, though doesn't return them
+ // as tokens if RETURN_DELIMS is also used.
+ RETURN_EMPTY_TOKENS = 1 << 1,
+ };
+
+ // The string object must live longer than the tokenizer. In particular, this
+ // should not be constructed with a temporary. The deleted rvalue constructor
+ // blocks the most obvious instances of this (e.g. passing a string literal to
+ // the constructor), but caution must still be exercised.
+ StringTokenizerT(const str& string,
+ const str& delims) {
+ Init(string.begin(), string.end(), delims);
+ }
+
+ // Don't allow temporary strings to be used with string tokenizer, since
+ // Init() would otherwise save iterators to a temporary string.
+ StringTokenizerT(str&&, const str& delims) = delete;
+
+ StringTokenizerT(const_iterator string_begin,
+ const_iterator string_end,
+ const str& delims) {
+ Init(string_begin, string_end, delims);
+ }
+
+ // Set the options for this tokenizer. By default, this is 0.
+ void set_options(int options) { options_ = options; }
+
+ // Set the characters to regard as quotes. By default, this is empty. When
+ // a quote char is encountered, the tokenizer will switch into a mode where
+ // it ignores delimiters that it finds. It switches out of this mode once it
+ // finds another instance of the quote char. If a backslash is encountered
+ // within a quoted string, then the next character is skipped.
+ void set_quote_chars(const str& quotes) { quotes_ = quotes; }
+
+ // Call this method to advance the tokenizer to the next delimiter. This
+ // returns false if the tokenizer is complete. This method must be called
+ // before calling any of the token* methods.
+ bool GetNext() {
+ if (quotes_.empty() && options_ == 0)
+ return QuickGetNext();
+ else
+ return FullGetNext();
+ }
+
+  // Start iterating through tokens from the beginning of the string. Restoring
+  // |token_is_delim_| matters after a regular token: FullGetNext() would
+  // otherwise consume the first character as if it were a delimiter.
+  void Reset() { token_end_ = start_pos_; token_is_delim_ = true; }
+
+ // Returns true if token is a delimiter. When the tokenizer is constructed
+ // with the RETURN_DELIMS option, this method can be used to check if the
+ // returned token is actually a delimiter. Returns true before the first
+ // time GetNext() has been called, and after GetNext() returns false.
+ bool token_is_delim() const { return token_is_delim_; }
+
+ // If GetNext() returned true, then these methods may be used to read the
+ // value of the token.
+ const_iterator token_begin() const { return token_begin_; }
+ const_iterator token_end() const { return token_end_; }
+ str token() const { return str(token_begin_, token_end_); }
+ BasicStringPiece<str> token_piece() const {
+ return BasicStringPiece<str>(&*token_begin_,
+ std::distance(token_begin_, token_end_));
+ }
+
+ private:
+ void Init(const_iterator string_begin,
+ const_iterator string_end,
+ const str& delims) {
+ start_pos_ = string_begin;
+ token_begin_ = string_begin;
+ token_end_ = string_begin;
+ end_ = string_end;
+ delims_ = delims;
+ options_ = 0;
+ token_is_delim_ = true;
+ }
+
+ // Implementation of GetNext() for when we have no quote characters. We have
+ // two separate implementations because AdvanceOne() is a hot spot in large
+ // text files with large tokens.
+ bool QuickGetNext() {
+ token_is_delim_ = false;
+ for (;;) {
+ token_begin_ = token_end_;
+ if (token_end_ == end_) {
+ token_is_delim_ = true;
+ return false;
+ }
+ ++token_end_;
+ if (delims_.find(*token_begin_) == str::npos)
+ break;
+ // else skip over delimiter.
+ }
+ while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
+ ++token_end_;
+ return true;
+ }
+
+ // Implementation of GetNext() for when we have to take quotes into account.
+ bool FullGetNext() {
+ AdvanceState state;
+
+ for (;;) {
+ if (token_is_delim_) {
+ // Last token was a delimiter. Note: This is also the case at the start.
+ //
+ // ... D T T T T D ...
+ // ^ ^
+ // | |
+ // | |token_end_| : The next character to look at or |end_|.
+ // |
+ // |token_begin_| : Points to delimiter or |token_end_|.
+ //
+ // The next token is always a non-delimiting token. It could be empty,
+ // however.
+ token_is_delim_ = false;
+ token_begin_ = token_end_;
+
+ // Slurp all non-delimiter characters into the token.
+ while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) {
+ ++token_end_;
+ }
+
+ // If it's non-empty, or empty tokens were requested, return the token.
+ if (token_begin_ != token_end_ || (options_ & RETURN_EMPTY_TOKENS))
+ return true;
+ }
+
+ GURL_DCHECK(!token_is_delim_);
+ // Last token was a regular token.
+ //
+ // ... T T T D T T ...
+ // ^ ^
+ // | |
+ // | token_end_ : The next character to look at. Always one
+ // | char beyond the token boundary.
+ // |
+ // token_begin_ : Points to beginning of token. Note: token could
+ // be empty, in which case
+ // token_begin_ == token_end_.
+ //
+ // The next token is always a delimiter. It could be |end_| however, but
+ // |end_| is also an implicit delimiter.
+ token_is_delim_ = true;
+ token_begin_ = token_end_;
+
+ if (token_end_ == end_)
+ return false;
+
+ // Look at the delimiter.
+ ++token_end_;
+ if (options_ & RETURN_DELIMS)
+ return true;
+ }
+
+ return false;
+ }
+
+ bool IsDelim(char_type c) const {
+ return delims_.find(c) != str::npos;
+ }
+
+ bool IsQuote(char_type c) const {
+ return quotes_.find(c) != str::npos;
+ }
+
+ struct AdvanceState {
+ bool in_quote;
+ bool in_escape;
+ char_type quote_char;
+ AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
+ };
+
+ // Returns true if a delimiter was not hit.
+ bool AdvanceOne(AdvanceState* state, char_type c) {
+ if (state->in_quote) {
+ if (state->in_escape) {
+ state->in_escape = false;
+ } else if (c == '\\') {
+ state->in_escape = true;
+ } else if (c == state->quote_char) {
+ state->in_quote = false;
+ }
+ } else {
+ if (IsDelim(c))
+ return false;
+ state->in_quote = IsQuote(state->quote_char = c);
+ }
+ return true;
+ }
+
+ const_iterator start_pos_;
+ const_iterator token_begin_;
+ const_iterator token_end_;
+ const_iterator end_;
+ str delims_;
+ str quotes_;
+ int options_;
+ bool token_is_delim_;
+};
+
+typedef StringTokenizerT<std::string, std::string::const_iterator>
+ StringTokenizer;
+typedef StringTokenizerT<string16, string16::const_iterator> String16Tokenizer;
+typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_TOKENIZER_H_
diff --git a/base/strings/string_tokenizer_fuzzer.cc b/base/strings/string_tokenizer_fuzzer.cc
new file mode 100644
index 0000000..3aaee7b
--- /dev/null
+++ b/base/strings/string_tokenizer_fuzzer.cc
@@ -0,0 +1,59 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+
+#include "base/strings/string_tokenizer.h"
+
+// Drains |t|: keeps calling GetNext() until it returns false, reading each
+// token so that the token() accessor is exercised too.
+void GetAllTokens(gurl_base::StringTokenizer& t) {
+  for (bool has_token = t.GetNext(); has_token; has_token = t.GetNext())
+    static_cast<void>(t.token());
+}
+
+// Entry point for LibFuzzer.
+//
+// Input layout: the first sizeof(size_t) bytes seed the pattern length, the
+// next |pattern_size| bytes are the delimiter set handed to the tokenizer,
+// and the remaining bytes (always at least one) are the string to tokenize.
+// Inputs too short to hold the header plus one byte are ignored.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  constexpr size_t kHeaderSize = sizeof(size_t);
+  if (size < kHeaderSize + 1) {
+    return 0;
+  }
+
+  // Copy the header out with memcpy: |data| comes with no alignment guarantee,
+  // so dereferencing it as a size_t* would be undefined behavior.
+  size_t raw_pattern_size = 0;
+  memcpy(&raw_pattern_size, data, kHeaderSize);
+
+  // Calculate pattern size based on remaining bytes, otherwise fuzzing is
+  // inefficient with bailouts in most cases. |payload_size| is at least 1
+  // because of the size check above, so the modulo is well defined.
+  const size_t payload_size = size - kHeaderSize;
+  const size_t pattern_size = raw_pattern_size % payload_size;
+
+  const char* payload = reinterpret_cast<const char*>(data) + kHeaderSize;
+  std::string pattern(payload, pattern_size);
+  std::string input(payload + pattern_size, payload_size - pattern_size);
+
+  // Allow quote_chars and options to be set. Otherwise full coverage
+  // won't be possible since IsQuote, FullGetNext and other functions
+  // won't be called.
+  for (bool return_delims : {false, true}) {
+    for (bool return_empty_strings : {false, true}) {
+      int options = 0;
+      if (return_delims)
+        options |= gurl_base::StringTokenizer::RETURN_DELIMS;
+      if (return_empty_strings)
+        options |= gurl_base::StringTokenizer::RETURN_EMPTY_TOKENS;
+
+      // Once without quote handling...
+      gurl_base::StringTokenizer t(input, pattern);
+      t.set_options(options);
+      GetAllTokens(t);
+
+      // ...and once with '"' registered as a quote character.
+      gurl_base::StringTokenizer t_quote(input, pattern);
+      t_quote.set_quote_chars("\"");
+      t_quote.set_options(options);
+      GetAllTokens(t_quote);
+    }
+  }
+
+  return 0;
+}
diff --git a/base/strings/string_tokenizer_unittest.cc b/base/strings/string_tokenizer_unittest.cc
new file mode 100644
index 0000000..1665d5d
--- /dev/null
+++ b/base/strings/string_tokenizer_unittest.cc
@@ -0,0 +1,387 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_tokenizer.h"
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+using std::string;
+
+namespace gurl_base {
+
+namespace {
+
+// Splitting on a single space yields each word as a non-delimiter token.
+TEST(StringTokenizerTest, Simple) {
+  string input = "this is a test";
+  StringTokenizer t(input, " ");
+  // The start of string, before returning any tokens, is considered a
+  // delimiter.
+  EXPECT_TRUE(t.token_is_delim());
+
+  for (const char* expected : {"this", "is", "a", "test"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_FALSE(t.token_is_delim());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+  // The end of string, after the last token, is considered a delimiter.
+  EXPECT_TRUE(t.token_is_delim());
+}
+
+// After Reset() the tokenizer replays the same token sequence from the start.
+TEST(StringTokenizerTest, Reset) {
+  string input = "this is a test";
+  StringTokenizer t(input, " ");
+
+  for (int pass = 0; pass < 2; ++pass) {
+    EXPECT_TRUE(t.token_is_delim());
+
+    for (const char* expected : {"this", "is", "a", "test"}) {
+      EXPECT_TRUE(t.GetNext());
+      EXPECT_FALSE(t.token_is_delim());
+      EXPECT_EQ(expected, t.token());
+    }
+
+    EXPECT_FALSE(t.GetNext());
+    EXPECT_TRUE(t.token_is_delim());
+
+    t.Reset();
+  }
+}
+
+// With RETURN_DELIMS each delimiter is handed back as its own token and is
+// flagged by token_is_delim().
+TEST(StringTokenizerTest, RetDelims) {
+  string input = "this is a test";
+  StringTokenizer t(input, " ");
+  t.set_options(StringTokenizer::RETURN_DELIMS);
+  EXPECT_TRUE(t.token_is_delim());
+
+  const struct {
+    const char* token;
+    bool is_delim;
+  } kExpected[] = {
+      {"this", false}, {" ", true}, {"is", false},   {" ", true},
+      {"a", false},    {" ", true}, {"test", false},
+  };
+  for (const auto& expected : kExpected) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected.is_delim, t.token_is_delim());
+    EXPECT_EQ(expected.token, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+  EXPECT_TRUE(t.token_is_delim());
+}
+
+// With RETURN_EMPTY_TOKENS adjacent delimiters produce empty tokens, and a
+// delimiter inside a quoted run does not split the token.
+TEST(StringTokenizerTest, RetEmptyTokens) {
+  string input = "foo='a, b',,bar,,baz,quux";
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"foo='a, b'", "", "bar", "", "baz", "quux"}) {
+    ASSERT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// A leading delimiter produces an empty first token.
+TEST(StringTokenizerTest, RetEmptyTokens_AtStart) {
+  string input = ",bar";
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"", "bar"}) {
+    ASSERT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// A trailing delimiter produces an empty last token.
+TEST(StringTokenizerTest, RetEmptyTokens_AtEnd) {
+  string input = "bar,";
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"bar", ""}) {
+    ASSERT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// A lone delimiter produces two empty tokens, one on each side of it.
+TEST(StringTokenizerTest, RetEmptyTokens_Both) {
+  string input = ",";
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
+  t.set_quote_chars("'");
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_TRUE(t.GetNext());
+    EXPECT_EQ("", t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// Empty input still produces exactly one (empty) token.
+TEST(StringTokenizerTest, RetEmptyTokens_Empty) {
+  string input;
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_EMPTY_TOKENS);
+
+  const bool got_first_token = t.GetNext();
+  ASSERT_TRUE(got_first_token);
+  EXPECT_EQ("", t.token());
+
+  const bool got_second_token = t.GetNext();
+  EXPECT_FALSE(got_second_token);
+}
+
+// With both options set, delimiters and the empty tokens between adjacent
+// delimiters are all returned, in input order.
+TEST(StringTokenizerTest, RetDelimsAndEmptyTokens) {
+  string input = "foo='a, b',,bar,,baz,quux";
+  StringTokenizer t(input, ",");
+  t.set_options(StringTokenizer::RETURN_DELIMS |
+                StringTokenizer::RETURN_EMPTY_TOKENS);
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"foo='a, b'", ",", "", ",", "bar", ",", "", ",",
+                               "baz", ",", "quux"}) {
+    ASSERT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// Any character of the delimiter set splits the input; runs of delimiters
+// produce no tokens when no options are set.
+TEST(StringTokenizerTest, ManyDelims) {
+  string input = "this: is, a-test";
+  StringTokenizer t(input, ": ,-");
+
+  for (const char* expected : {"this", "is", "a", "test"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// Tokenizes an HTTP-style header line with RETURN_DELIMS and checks every
+// token together with its delimiter flag.
+TEST(StringTokenizerTest, ParseHeader) {
+  string input = "Content-Type: text/html ; charset=UTF-8";
+  StringTokenizer t(input, ": ;=");
+  t.set_options(StringTokenizer::RETURN_DELIMS);
+  EXPECT_TRUE(t.token_is_delim());
+
+  const struct {
+    const char* token;
+    bool is_delim;
+  } kExpected[] = {
+      {"Content-Type", false}, {":", true},  {" ", true},
+      {"text/html", false},    {" ", true},  {";", true},
+      {" ", true},             {"charset", false},
+      {"=", true},             {"UTF-8", false},
+  };
+  for (const auto& expected : kExpected) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected.is_delim, t.token_is_delim());
+    EXPECT_EQ(expected.token, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+  EXPECT_TRUE(t.token_is_delim());
+}
+
+// A delimiter inside a quoted run does not split the token, and the quotes
+// stay part of the token.
+TEST(StringTokenizerTest, ParseQuotedString) {
+  string input = "foo bar 'hello world' baz";
+  StringTokenizer t(input, " ");
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"foo", "bar", "'hello world'", "baz"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// An unterminated quoted run extends to the end of the input.
+TEST(StringTokenizerTest, ParseQuotedString_Malformed) {
+  string input = "bar 'hello wo";
+  StringTokenizer t(input, " ");
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"bar", "'hello wo"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// With several quote characters configured, a run is closed only by the same
+// character that opened it.
+TEST(StringTokenizerTest, ParseQuotedString_Multiple) {
+  string input = "bar 'hel\"lo\" wo' baz\"";
+  StringTokenizer t(input, " ");
+  t.set_quote_chars("'\"");
+
+  for (const char* expected : {"bar", "'hel\"lo\" wo'", "baz\""}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// A backslash-escaped quote character inside a quoted run does not close it.
+TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes) {
+  string input = "foo 'don\\'t do that'";
+  StringTokenizer t(input, " ");
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"foo", "'don\\'t do that'"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+// A quoted run that starts in the middle of a token still shields the
+// delimiters inside it.
+TEST(StringTokenizerTest, ParseQuotedString_EscapedQuotes2) {
+  string input = "foo='a, b', bar";
+  StringTokenizer t(input, ", ");
+  t.set_quote_chars("'");
+
+  for (const char* expected : {"foo='a, b'", "bar"}) {
+    EXPECT_TRUE(t.GetNext());
+    EXPECT_EQ(expected, t.token());
+  }
+
+  EXPECT_FALSE(t.GetNext());
+}
+
+} // namespace
+
+} // namespace gurl_base
diff --git a/base/strings/string_util.cc b/base/strings/string_util.cc
new file mode 100644
index 0000000..2b2591d
--- /dev/null
+++ b/base/strings/string_util.cc
@@ -0,0 +1,1119 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "polyfills/base/logging.h"
+#include "base/no_destructor.h"
+#include "base/stl_util.h"
+#include "base/strings/utf_string_conversion_utils.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/third_party/icu/icu_utf.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+namespace {
+
+// Records where one substituted parameter landed in the string. Used by
+// ReplaceStringPlaceholders.
+struct ReplacementOffset {
+  ReplacementOffset(uintptr_t parameter_index, size_t string_offset)
+      : parameter(parameter_index), offset(string_offset) {}
+
+  // Which parameter was substituted.
+  uintptr_t parameter;
+
+  // Position in the string where the substituted text starts.
+  size_t offset;
+};
+
+// Less-than ordering on ReplacementOffset that looks only at |parameter|;
+// |offset| is ignored.
+static bool CompareParameter(const ReplacementOffset& elem1,
+ const ReplacementOffset& elem2) {
+ return elem1.parameter < elem2.parameter;
+}
+
+// Overloaded function to append one string onto the end of another. Having a
+// separate overload for |source| as both string and StringPiece allows for more
+// efficient usage from functions templated to work with either type (avoiding a
+// redundant call to the BasicStringPiece constructor in both cases).
+//
+// String overload: appends |source| with string_type::append().
+template <typename string_type>
+inline void AppendToString(string_type* target, const string_type& source) {
+ target->append(source);
+}
+
+// StringPiece overload: delegates to BasicStringPiece::AppendToString().
+template <typename string_type>
+inline void AppendToString(string_type* target,
+ const BasicStringPiece<string_type>& source) {
+ source.AppendToString(target);
+}
+
+// MachineWord is an unsigned integer as wide as a pointer, which is assumed
+// to also be the width of a machine word.
+using MachineWord = uintptr_t;
+
+// Returns true when the address held by |pointer| is a multiple of
+// sizeof(MachineWord).
+inline bool IsMachineWordAligned(const void* pointer) {
+  const MachineWord address = reinterpret_cast<MachineWord>(pointer);
+  return address % sizeof(MachineWord) == 0;
+}
+
+// NonASCIIMask<Char>::value() is a MachineWord-sized bit mask used by
+// DoIsStringASCII: after OR-ing characters (or whole words of characters)
+// together, any bit that survives the mask means a value above 0x7F was seen.
+// Only the specializations below are defined; the primary template is
+// declared but left undefined.
+template <typename CharacterType>
+struct NonASCIIMask;
+// 8-bit characters: bit 7 of every byte.
+template <>
+struct NonASCIIMask<char> {
+ static constexpr MachineWord value() {
+ return static_cast<MachineWord>(0x8080808080808080ULL);
+ }
+};
+// 16-bit characters: bits 7..15 of every 16-bit unit.
+template <>
+struct NonASCIIMask<char16> {
+ static constexpr MachineWord value() {
+ return static_cast<MachineWord>(0xFF80FF80FF80FF80ULL);
+ }
+};
+#if defined(WCHAR_T_IS_UTF32)
+// 32-bit wchar_t: bits 7..31 of every 32-bit unit.
+template <>
+struct NonASCIIMask<wchar_t> {
+ static constexpr MachineWord value() {
+ return static_cast<MachineWord>(0xFFFFFF80FFFFFF80ULL);
+ }
+};
+#endif // WCHAR_T_IS_UTF32
+
+} // namespace
+
+// Scans |format| and returns false as soon as it meets a conversion
+// specification treated as unportable: %s or %c without an 'l' modifier, or
+// any of %S, %C, %F, %D, %O, %U. Returns true otherwise, including when the
+// string ends in the middle of a specification.
+bool IsWprintfFormatPortable(const wchar_t* format) {
+  for (const wchar_t* cursor = format; *cursor != '\0'; ++cursor) {
+    if (*cursor != '%')
+      continue;
+
+    // Walk the specification that follows the '%' until a character that
+    // terminates it is found.
+    bool has_l_modifier = false;
+    for (;;) {
+      const wchar_t ch = *++cursor;
+      if (ch == '\0') {
+        // The format string ended inside a specification. It is equally
+        // broken everywhere and nothing unportable was seen, so report it as
+        // portable.
+        return true;
+      }
+
+      if (ch == 'l') {
+        // Only 'l' makes the 's' and 'c' specifiers acceptable.
+        has_l_modifier = true;
+      } else if (ch == 'S' || ch == 'C' || ch == 'F' || ch == 'D' ||
+                 ch == 'O' || ch == 'U' ||
+                 (!has_l_modifier && (ch == 's' || ch == 'c'))) {
+        // Not portable.
+        return false;
+      }
+
+      if (wcschr(L"diouxXeEfgGaAcspn%", ch))
+        break;  // Specification complete; resume scanning the format string.
+    }
+  }
+
+  return true;
+}
+
+namespace {
+
+// Builds a new StringType holding ToLowerASCII() of every character of |str|,
+// in order.
+template<typename StringType>
+StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
+  StringType lowered;
+  lowered.reserve(str.size());
+  for (typename StringType::value_type ch : str)
+    lowered.push_back(ToLowerASCII(ch));
+  return lowered;
+}
+
+// Builds a new StringType holding ToUpperASCII() of every character of |str|,
+// in order.
+template<typename StringType>
+StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
+  StringType raised;
+  raised.reserve(str.size());
+  for (typename StringType::value_type ch : str)
+    raised.push_back(ToUpperASCII(ch));
+  return raised;
+}
+
+} // namespace
+
+// Public ASCII case-conversion entry points. Each forwards to the matching
+// *Impl template instantiated for its string type and returns a new string.
+
+// 8-bit lower-casing.
+std::string ToLowerASCII(StringPiece str) {
+ return ToLowerASCIIImpl<std::string>(str);
+}
+
+// 16-bit lower-casing.
+string16 ToLowerASCII(StringPiece16 str) {
+ return ToLowerASCIIImpl<string16>(str);
+}
+
+// 8-bit upper-casing.
+std::string ToUpperASCII(StringPiece str) {
+ return ToUpperASCIIImpl<std::string>(str);
+}
+
+// 16-bit upper-casing.
+string16 ToUpperASCII(StringPiece16 str) {
+ return ToUpperASCIIImpl<string16>(str);
+}
+
+// Three-way comparison of |a| and |b| after passing each character through
+// ToLowerASCII(). Returns -1, 0 or 1. When one string is a prefix of the
+// other, the shorter one orders first.
+template<class StringType>
+int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a,
+                                 BasicStringPiece<StringType> b) {
+  using CharType = typename StringType::value_type;
+
+  // Walk the part both strings have in common and stop at the first pair of
+  // characters that differ once lower-cased.
+  const size_t common_length = std::min(a.length(), b.length());
+  for (size_t i = 0; i < common_length; ++i) {
+    const CharType lower_a = ToLowerASCII(a[i]);
+    const CharType lower_b = ToLowerASCII(b[i]);
+    if (lower_a != lower_b)
+      return lower_a < lower_b ? -1 : 1;
+  }
+
+  // No difference in the common part, so the lengths decide. Equal lengths is
+  // the expected common case and is checked first.
+  if (a.length() == b.length())
+    return 0;
+  return a.length() < b.length() ? -1 : 1;
+}
+
+// 8-bit wrapper around CompareCaseInsensitiveASCIIT.
+int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) {
+ return CompareCaseInsensitiveASCIIT<std::string>(a, b);
+}
+
+// 16-bit wrapper around CompareCaseInsensitiveASCIIT.
+int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) {
+ return CompareCaseInsensitiveASCIIT<string16>(a, b);
+}
+
+// Returns true if |a| and |b| compare equal under
+// CompareCaseInsensitiveASCIIT. Strings of different length are rejected up
+// front without comparing any characters.
+bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) {
+ if (a.length() != b.length())
+ return false;
+ return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
+}
+
+// 16-bit counterpart of the overload above.
+bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) {
+ if (a.length() != b.length())
+ return false;
+ return CompareCaseInsensitiveASCIIT<string16>(a, b) == 0;
+}
+
+// Returns a reference to a function-local static, default-constructed
+// std::string held in a NoDestructor wrapper.
+const std::string& EmptyString() {
+ static const gurl_base::NoDestructor<std::string> s;
+ return *s;
+}
+
+// string16 counterpart of EmptyString().
+const string16& EmptyString16() {
+ static const gurl_base::NoDestructor<string16> s16;
+ return *s16;
+}
+
+// Forward declaration; the definition is not in this part of the file. The
+// four functions below all forward to it and return its result unchanged.
+template <class StringType>
+bool ReplaceCharsT(const StringType& input,
+ BasicStringPiece<StringType> find_any_of_these,
+ BasicStringPiece<StringType> replace_with,
+ StringType* output);
+
+// 16-bit ReplaceChars: forwards |replace_chars| as the set to look for and
+// |replace_with| as the replacement text.
+bool ReplaceChars(const string16& input,
+ StringPiece16 replace_chars,
+ const string16& replace_with,
+ string16* output) {
+ return ReplaceCharsT(input, replace_chars, StringPiece16(replace_with),
+ output);
+}
+
+// 8-bit ReplaceChars.
+bool ReplaceChars(const std::string& input,
+ StringPiece replace_chars,
+ const std::string& replace_with,
+ std::string* output) {
+ return ReplaceCharsT(input, replace_chars, StringPiece(replace_with), output);
+}
+
+// 16-bit RemoveChars: same as ReplaceChars with an empty replacement.
+bool RemoveChars(const string16& input,
+ StringPiece16 remove_chars,
+ string16* output) {
+ return ReplaceCharsT(input, remove_chars, StringPiece16(), output);
+}
+
+// 8-bit RemoveChars: same as ReplaceChars with an empty replacement.
+bool RemoveChars(const std::string& input,
+ StringPiece remove_chars,
+ std::string* output) {
+ return ReplaceCharsT(input, remove_chars, StringPiece(), output);
+}
+
+// Writes to |output| a copy of |input| with the characters of |trim_chars|
+// removed from the end(s) selected by |positions|, and returns which ends
+// actually lost characters. |output| may point at |input|.
+//
+// Special cases: empty input clears |output| and returns TRIM_NONE; input
+// that is trimmed away entirely clears |output| and returns |positions|.
+template<typename Str>
+TrimPositions TrimStringT(const Str& input,
+                          BasicStringPiece<Str> trim_chars,
+                          TrimPositions positions,
+                          Str* output) {
+  // Nothing can be stripped from an empty string, but |output| still has to
+  // end up empty.
+  if (input.empty()) {
+    output->clear();
+    return TRIM_NONE;
+  }
+
+  // Search through a StringPiece view of |input| so that find* can take the
+  // StringPiece |trim_chars| directly (it is normally a constant, so avoid
+  // copying it).
+  BasicStringPiece<Str> view(input);
+  const size_t final_index = input.length() - 1;
+
+  size_t keep_first = 0;
+  if (positions & TRIM_LEADING)
+    keep_first = view.find_first_not_of(trim_chars);
+
+  size_t keep_last = final_index;
+  if (positions & TRIM_TRAILING)
+    keep_last = view.find_last_not_of(trim_chars);
+
+  // Everything was trimmable: report the ends the caller asked about.
+  if (keep_first == Str::npos || keep_last == Str::npos) {
+    output->clear();
+    return positions;
+  }
+
+  // Trim. substr() builds a temporary first, so this is safe even when
+  // |output| aliases |input|.
+  *output = input.substr(keep_first, keep_last - keep_first + 1);
+
+  // Return where we trimmed from.
+  return static_cast<TrimPositions>(
+      (keep_first != 0 ? TRIM_LEADING : TRIM_NONE) |
+      (keep_last != final_index ? TRIM_TRAILING : TRIM_NONE));
+}
+
+// Trims |trim_chars| from both ends of |input| into |output|. Returns true
+// if TrimStringT reports anything other than TRIM_NONE.
+bool TrimString(const string16& input,
+ StringPiece16 trim_chars,
+ string16* output) {
+ return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
+}
+
+// 8-bit counterpart of the overload above.
+bool TrimString(const std::string& input,
+ StringPiece trim_chars,
+ std::string* output) {
+ return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
+}
+
+// Returns the sub-piece of |input| that remains after removing the
+// characters of |trim_chars| from the end(s) selected by |positions|. No data
+// is copied; the result views the same buffer as |input|.
+//
+// When nothing survives the trim the result is an empty piece. That case is
+// handled explicitly so substr() is never called with an npos position or a
+// wrapped-around length.
+template<typename Str>
+BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input,
+                                       BasicStringPiece<Str> trim_chars,
+                                       TrimPositions positions) {
+  size_t begin = 0;
+  if (positions & TRIM_LEADING) {
+    begin = input.find_first_not_of(trim_chars);
+    // No character survives: return an empty piece positioned at the end.
+    if (begin == BasicStringPiece<Str>::npos)
+      return input.substr(input.size(), 0);
+  }
+
+  size_t end = input.size();
+  if (positions & TRIM_TRAILING) {
+    const size_t last_kept = input.find_last_not_of(trim_chars);
+    // Only reachable when leading trimming is off; everything is trailing
+    // trim, so return an empty piece positioned at the start.
+    if (last_kept == BasicStringPiece<Str>::npos)
+      return input.substr(0, 0);
+    end = last_kept + 1;
+  }
+
+  return input.substr(begin, end - begin);
+}
+
+// StringPiece16 trimming: forwards to TrimStringPieceT and returns a piece
+// rather than writing to an output string.
+StringPiece16 TrimString(StringPiece16 input,
+ StringPiece16 trim_chars,
+ TrimPositions positions) {
+ return TrimStringPieceT(input, trim_chars, positions);
+}
+
+// 8-bit counterpart of the overload above.
+StringPiece TrimString(StringPiece input,
+ StringPiece trim_chars,
+ TrimPositions positions) {
+ return TrimStringPieceT(input, trim_chars, positions);
+}
+
+// Writes to |output| a prefix of |input| that is at most |byte_size| bytes
+// long and that ends on the boundary of a character accepted by
+// IsValidCharacter() and IsValidCodepoint(). If |byte_size| exceeds the input
+// length the whole input is copied. If no acceptable character is found in
+// the first |byte_size| bytes, |output| is cleared.
+void TruncateUTF8ToByteSize(const std::string& input,
+ const size_t byte_size,
+ std::string* output) {
+ GURL_DCHECK(output);
+ if (byte_size > input.length()) {
+ *output = input;
+ return;
+ }
+ GURL_DCHECK_LE(byte_size,
+ static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
+ // Note: This cast is necessary because CBU8_NEXT uses int32_ts.
+ int32_t truncation_length = static_cast<int32_t>(byte_size);
+ int32_t char_index = truncation_length - 1;
+ const char* data = input.data();
+
+ // Using CBU8, we will move backwards from the truncation point
+ // to the beginning of the string looking for a valid UTF8
+ // character. Once a full UTF8 character is found, we will
+ // truncate the string to the end of that character.
+ while (char_index >= 0) {
+ // Remember the start position: CBU8_NEXT updates |char_index| in place
+ // (presumably advancing it past the decoded character -- the macro is
+ // defined elsewhere).
+ int32_t prev = char_index;
+ base_icu::UChar32 code_point = 0;
+ CBU8_NEXT(data, char_index, truncation_length, code_point);
+ if (!IsValidCharacter(code_point) ||
+ !IsValidCodepoint(code_point)) {
+ // Not acceptable from this position; retry one byte earlier.
+ char_index = prev - 1;
+ } else {
+ break;
+ }
+ }
+
+ // |char_index| is -1 when the loop ran out of candidates (or never ran
+ // because |byte_size| was 0); otherwise it is the length to keep.
+ if (char_index >= 0 )
+ *output = input.substr(0, char_index);
+ else
+ output->clear();
+}
+
+// Trims the characters of kWhitespaceUTF16 from |input| at |positions| and
+// writes the result to |output|; returns what TrimStringT returns.
+TrimPositions TrimWhitespace(const string16& input,
+ TrimPositions positions,
+ string16* output) {
+ return TrimStringT(input, StringPiece16(kWhitespaceUTF16), positions, output);
+}
+
+// Piece-returning variant of the overload above.
+StringPiece16 TrimWhitespace(StringPiece16 input,
+ TrimPositions positions) {
+ return TrimStringPieceT(input, StringPiece16(kWhitespaceUTF16), positions);
+}
+
+// Trims the characters of kWhitespaceASCII from |input| at |positions| and
+// writes the result to |output|; returns what TrimStringT returns.
+TrimPositions TrimWhitespaceASCII(const std::string& input,
+ TrimPositions positions,
+ std::string* output) {
+ return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output);
+}
+
+// Piece-returning variant of the overload above.
+StringPiece TrimWhitespaceASCII(StringPiece input, TrimPositions positions) {
+ return TrimStringPieceT(input, StringPiece(kWhitespaceASCII), positions);
+}
+
+// Returns a copy of |text| with leading and trailing whitespace removed and
+// every interior run of whitespace (as judged by IsUnicodeWhitespace) reduced
+// to a single space. When |trim_sequences_with_line_breaks| is true, an
+// interior run that contains a CR or LF is removed entirely instead.
+template<typename STR>
+STR CollapseWhitespaceT(const STR& text,
+                        bool trim_sequences_with_line_breaks) {
+  // The result can never be longer than the input, so size it once up front
+  // and shrink it at the end.
+  STR result;
+  result.resize(text.size());
+
+  // Set flags to pretend we're already in a trimmed whitespace sequence, so we
+  // will trim any leading whitespace.
+  bool in_whitespace = true;
+  bool already_trimmed = true;
+
+  // Number of characters committed to |result|. This is a size_t so it cannot
+  // overflow on very long strings. It is only decremented while the most
+  // recent write was a collapsed space, so it never wraps below zero.
+  size_t chars_written = 0;
+  for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
+    if (IsUnicodeWhitespace(*i)) {
+      if (!in_whitespace) {
+        // Reduce all whitespace sequences to a single space.
+        in_whitespace = true;
+        result[chars_written++] = L' ';
+      }
+      if (trim_sequences_with_line_breaks && !already_trimmed &&
+          ((*i == '\n') || (*i == '\r'))) {
+        // Whitespace sequences containing CR or LF are eliminated entirely.
+        already_trimmed = true;
+        --chars_written;
+      }
+    } else {
+      // Non-whitespace characters are copied straight across.
+      in_whitespace = false;
+      already_trimmed = false;
+      result[chars_written++] = *i;
+    }
+  }
+
+  if (in_whitespace && !already_trimmed) {
+    // Any trailing whitespace is eliminated.
+    --chars_written;
+  }
+
+  result.resize(chars_written);
+  return result;
+}
+
+// 16-bit wrapper around CollapseWhitespaceT.
+string16 CollapseWhitespace(const string16& text,
+ bool trim_sequences_with_line_breaks) {
+ return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
+}
+
+// 8-bit wrapper around CollapseWhitespaceT.
+std::string CollapseWhitespaceASCII(const std::string& text,
+ bool trim_sequences_with_line_breaks) {
+ return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
+}
+
+// Returns true if |input| has no character outside the set |characters|.
+bool ContainsOnlyChars(StringPiece input, StringPiece characters) {
+ return input.find_first_not_of(characters) == StringPiece::npos;
+}
+
+// 16-bit counterpart of the overload above.
+bool ContainsOnlyChars(StringPiece16 input, StringPiece16 characters) {
+ return input.find_first_not_of(characters) == StringPiece16::npos;
+}
+
+// Returns true if none of the |length| characters starting at |characters|
+// has a bit set in NonASCIIMask<Char>. An empty range is ASCII.
+//
+// Characters are OR-ed together and tested against the mask in bulk: one at a
+// time until the pointer is machine-word aligned, then a word at a time (in
+// batches of 16 words, with a check after each batch), then the leftover
+// words and characters.
+//
+// Loop bounds are expressed as "characters remaining" instead of comparing
+// against |end - N|, because forming a pointer before the start of the buffer
+// is undefined behavior when the input is shorter than N.
+template <class Char>
+inline bool DoIsStringASCII(const Char* characters, size_t length) {
+  if (!length)
+    return true;
+  constexpr MachineWord non_ascii_bit_mask = NonASCIIMask<Char>::value();
+  MachineWord all_char_bits = 0;
+  const Char* end = characters + length;
+
+  // Prologue: align the input.
+  while (!IsMachineWordAligned(characters) && characters < end)
+    all_char_bits |= *characters++;
+  if (all_char_bits & non_ascii_bit_mask)
+    return false;
+
+  // Compare the values of CPU word size.
+  constexpr size_t chars_per_word = sizeof(MachineWord) / sizeof(Char);
+  constexpr size_t batch_count = 16;
+  constexpr size_t chars_per_batch = batch_count * chars_per_word;
+  while (static_cast<size_t>(end - characters) >= chars_per_batch) {
+    all_char_bits = 0;
+    for (size_t i = 0; i < batch_count; ++i) {
+      all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
+      characters += chars_per_word;
+    }
+    if (all_char_bits & non_ascii_bit_mask)
+      return false;
+  }
+
+  // Process the remaining words.
+  all_char_bits = 0;
+  while (static_cast<size_t>(end - characters) >= chars_per_word) {
+    all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
+    characters += chars_per_word;
+  }
+
+  // Process the remaining bytes.
+  while (characters < end)
+    all_char_bits |= *characters++;
+
+  return !(all_char_bits & non_ascii_bit_mask);
+}
+
+// Returns DoIsStringASCII() over the characters of |str| (8-bit).
+bool IsStringASCII(StringPiece str) {
+ return DoIsStringASCII(str.data(), str.length());
+}
+
+// 16-bit overload.
+bool IsStringASCII(StringPiece16 str) {
+ return DoIsStringASCII(str.data(), str.length());
+}
+
+// wchar_t overload, compiled only when WCHAR_T_IS_UTF32 is defined.
+#if defined(WCHAR_T_IS_UTF32)
+bool IsStringASCII(WStringPiece str) {
+ return DoIsStringASCII(str.data(), str.length());
+}
+#endif
+
+// Decodes |str| one code point at a time with CBU8_NEXT and returns false as
+// soon as a decoded value fails IsValidCharacter(); returns true if the whole
+// string is consumed (including when it is empty).
+bool IsStringUTF8(StringPiece str) {
+ const char *src = str.data();
+ // The length is narrowed to int32_t for CBU8_NEXT.
+ // NOTE(review): lengths above INT32_MAX would not survive this cast --
+ // confirm callers never pass such input.
+ int32_t src_len = static_cast<int32_t>(str.length());
+ int32_t char_index = 0;
+
+ while (char_index < src_len) {
+ int32_t code_point;
+ // Presumably advances |char_index| and stores the decoded value in
+ // |code_point|; the macro is defined elsewhere.
+ CBU8_NEXT(src, char_index, src_len, code_point);
+ if (!IsValidCharacter(code_point))
+ return false;
+ }
+ return true;
+}
+
+// Returns true if |str|, with every character passed through ToLowerASCII(),
+// equals |lowercase_ascii| character for character.
+//
+// Implementation note: callers normally pass a hardcoded constant for
+// |lowercase_ascii|. Building a StringPiece from a C constant runs strlen, so
+// the buffers are walked twice: once to find the length of |lowercase_ascii|
+// and once to compare each letter.
+//
+// Taking a const char* instead would allow a single pass, but the strlen is
+// faster than the case-insensitive compares and allows an early exit when the
+// lengths differ (often the case for non-matches), so which approach wins
+// depends on the input.
+//
+// The hardcoded strings are typically very short so it doesn't matter, and a
+// StringPiece is more flexible for the caller (no null terminator needed), so
+// the StringPiece route is used.
+template<typename Str>
+static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
+                                          StringPiece lowercase_ascii) {
+  const size_t length = str.size();
+  if (length != lowercase_ascii.size())
+    return false;
+
+  // Advance while the lower-cased characters agree; reaching |length| means
+  // every position matched.
+  size_t matched = 0;
+  while (matched < length &&
+         !(ToLowerASCII(str[matched]) != lowercase_ascii[matched])) {
+    ++matched;
+  }
+  return matched == length;
+}
+
+// 8-bit wrapper around DoLowerCaseEqualsASCII.
+bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) {
+ return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
+}
+
+// 16-bit wrapper around DoLowerCaseEqualsASCII.
+bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) {
+ return DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii);
+}
+
+// Returns true if the 16-bit |str| and the 8-bit |ascii| have the same length
+// and compare equal element by element (case-sensitive).
+bool EqualsASCII(StringPiece16 str, StringPiece ascii) {
+ if (str.length() != ascii.length())
+ return false;
+ return std::equal(ascii.begin(), ascii.end(), str.begin());
+}
+
+// Returns true if |str| begins with |search_for|, comparing either exactly or
+// with CaseInsensitiveCompareASCII depending on |case_sensitivity|. A
+// |search_for| longer than |str| never matches.
+template<typename Str>
+bool StartsWithT(BasicStringPiece<Str> str,
+                 BasicStringPiece<Str> search_for,
+                 CompareCase case_sensitivity) {
+  const size_t needle_size = search_for.size();
+  if (str.size() < needle_size)
+    return false;
+
+  // The leading part of |str| that lines up with |search_for|.
+  BasicStringPiece<Str> head = str.substr(0, needle_size);
+
+  if (case_sensitivity == CompareCase::SENSITIVE)
+    return head == search_for;
+
+  if (case_sensitivity == CompareCase::INSENSITIVE_ASCII) {
+    return std::equal(search_for.begin(), search_for.end(), head.begin(),
+                      CaseInsensitiveCompareASCII<typename Str::value_type>());
+  }
+
+  GURL_NOTREACHED();
+  return false;
+}
+
+// 8-bit wrapper around StartsWithT.
+bool StartsWith(StringPiece str,
+ StringPiece search_for,
+ CompareCase case_sensitivity) {
+ return StartsWithT<std::string>(str, search_for, case_sensitivity);
+}
+
+// 16-bit wrapper around StartsWithT.
+bool StartsWith(StringPiece16 str,
+ StringPiece16 search_for,
+ CompareCase case_sensitivity) {
+ return StartsWithT<string16>(str, search_for, case_sensitivity);
+}
+
+// Returns true if |str| ends with |search_for|, comparing either exactly or
+// with CaseInsensitiveCompareASCII depending on |case_sensitivity|. A
+// |search_for| longer than |str| never matches.
+template <typename Str>
+bool EndsWithT(BasicStringPiece<Str> str,
+               BasicStringPiece<Str> search_for,
+               CompareCase case_sensitivity) {
+  const size_t needle_size = search_for.size();
+  if (str.size() < needle_size)
+    return false;
+
+  // The trailing part of |str| that lines up with |search_for|.
+  BasicStringPiece<Str> tail =
+      str.substr(str.size() - needle_size, needle_size);
+
+  if (case_sensitivity == CompareCase::SENSITIVE)
+    return tail == search_for;
+
+  if (case_sensitivity == CompareCase::INSENSITIVE_ASCII) {
+    return std::equal(tail.begin(), tail.end(), search_for.begin(),
+                      CaseInsensitiveCompareASCII<typename Str::value_type>());
+  }
+
+  GURL_NOTREACHED();
+  return false;
+}
+
+// 8-bit wrapper around EndsWithT.
+bool EndsWith(StringPiece str,
+ StringPiece search_for,
+ CompareCase case_sensitivity) {
+ return EndsWithT<std::string>(str, search_for, case_sensitivity);
+}
+
+// 16-bit wrapper around EndsWithT.
+bool EndsWith(StringPiece16 str,
+ StringPiece16 search_for,
+ CompareCase case_sensitivity) {
+ return EndsWithT<string16>(str, search_for, case_sensitivity);
+}
+
+// Converts the hexadecimal digit |c| ('0'-'9', 'A'-'F' or 'a'-'f') to its
+// numeric value 0-15. Debug builds DCHECK that |c| passes IsHexDigit(); any
+// other character yields 0.
+char HexDigitToInt(wchar_t c) {
+  GURL_DCHECK(IsHexDigit(c));
+  int value = 0;
+  if (c >= '0' && c <= '9') {
+    value = c - '0';
+  } else if (c >= 'A' && c <= 'F') {
+    value = c - 'A' + 10;
+  } else if (c >= 'a' && c <= 'f') {
+    value = c - 'a' + 10;
+  }
+  return static_cast<char>(value);
+}
+
+// Returns true if |c| is one of the characters listed in kWhitespaceWide.
+bool IsUnicodeWhitespace(wchar_t c) {
+  // kWhitespaceWide is NUL-terminated: walk it until |c| or the terminator is
+  // reached. Stopping on the terminator means |c| is not in the list.
+  const wchar_t* candidate = kWhitespaceWide;
+  while (*candidate && *candidate != c)
+    ++candidate;
+  return *candidate != 0;
+}
+
+// Unit suffixes indexed by how many times the byte count was divided by 1024.
+static const char* const kByteStringsUnlocalized[] = {
+ " B",
+ " kB",
+ " MB",
+ " GB",
+ " TB",
+ " PB"
+};
+
+// Formats |bytes| as a number followed by one of the suffixes above, e.g.
+// scaling by 1024 until the amount drops below 1024 or the largest unit is
+// reached. Scaled amounts below 100 are printed with one decimal place;
+// everything else (including plain byte counts) is printed with none.
+string16 FormatBytesUnlocalized(int64_t bytes) {
+ double unit_amount = static_cast<double>(bytes);
+ size_t dimension = 0;
+ const int kKilo = 1024;
+ // Stop at the last table entry so |dimension| always stays a valid index.
+ while (unit_amount >= kKilo &&
+ dimension < gurl_base::size(kByteStringsUnlocalized) - 1) {
+ unit_amount /= kKilo;
+ dimension++;
+ }
+
+ char buf[64];
+ if (bytes != 0 && dimension > 0 && unit_amount < 100) {
+ gurl_base::snprintf(buf, gurl_base::size(buf), "%.1lf%s", unit_amount,
+ kByteStringsUnlocalized[dimension]);
+ } else {
+ gurl_base::snprintf(buf, gurl_base::size(buf), "%.0lf%s", unit_amount,
+ kByteStringsUnlocalized[dimension]);
+ }
+
+ return ASCIIToUTF16(buf);
+}
+
+// A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
+template <class StringType>
+struct SubstringMatcher {
+ // The substring to look for.
+ BasicStringPiece<StringType> find_this;
+
+ // Returns the position of the first occurrence of |find_this| in |input| at
+ // or after |pos|, or StringType::npos if there is none.
+ size_t Find(const StringType& input, size_t pos) {
+ return input.find(find_this.data(), pos, find_this.length());
+ }
+ // Length of one match: the length of |find_this|.
+ size_t MatchSize() { return find_this.length(); }
+};
+
+// A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
+template <class StringType>
+struct CharacterMatcher {
+ // The set of characters, any one of which counts as a match.
+ BasicStringPiece<StringType> find_any_of_these;
+
+ // Returns the position of the first character of |input| at or after |pos|
+ // that is in |find_any_of_these|, or StringType::npos if there is none.
+ size_t Find(const StringType& input, size_t pos) {
+ return input.find_first_of(find_any_of_these.data(), pos,
+ find_any_of_these.length());
+ }
+ // A match is always exactly one character long.
+ constexpr size_t MatchSize() { return 1; }
+};
+
+enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST };  // Every match, or only the first one.
+
+// Replaces matches located by |matcher| in |*str|, searching from
+// |initial_offset|, with |replace_with|; |replace_type| selects all matches or
+// only the first. Returns |true| if any match was found. Runs in O(n) time in
+// the length of |str| and avoids reallocating when possible. |Matcher| is a
+// traits type so this can implement both character and substring replacement.
+template <class StringType, class Matcher>
+bool DoReplaceMatchesAfterOffset(StringType* str,
+ size_t initial_offset,
+ Matcher matcher,
+ BasicStringPiece<StringType> replace_with,
+ ReplaceType replace_type) {
+ using CharTraits = typename StringType::traits_type;
+
+ const size_t find_length = matcher.MatchSize();
+ if (!find_length)
+ return false;  // An empty pattern is treated as having no matches.
+
+ // If the find string doesn't appear, there's nothing to do.
+ size_t first_match = matcher.Find(*str, initial_offset);
+ if (first_match == StringType::npos)
+ return false;
+
+ // If we're only replacing one instance, there's no need to do anything
+ // complicated.
+ const size_t replace_length = replace_with.length();
+ if (replace_type == ReplaceType::REPLACE_FIRST) {
+ str->replace(first_match, find_length, replace_with.data(), replace_length);
+ return true;
+ }
+
+ // If the find and replace strings are the same length, each match can simply
+ // be overwritten in place, finishing the entire operation in O(n) time.
+ if (find_length == replace_length) {
+ auto* buffer = &((*str)[0]);
+ for (size_t offset = first_match; offset != StringType::npos;
+ offset = matcher.Find(*str, offset + replace_length)) {
+ CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
+ }
+ return true;
+ }
+
+ // Since the find and replace strings aren't the same length, calling
+ // replace() on each match would be O(n^2) in the worst case, as it shifts the
+ // entire remaining string each time. We need to be more clever to keep things
+ // O(n).
+ //
+ // When the string is being shortened, it's possible to just shift the matches
+ // down in one pass while finding, and truncate the length at the end of the
+ // search.
+ //
+ // If the string is being lengthened, more work is required. The strategy used
+ // here is to make two find() passes through the string. The first pass counts
+ // the number of matches to determine the new size. The second pass will
+ // either construct the new string into a new buffer (if the existing buffer
+ // lacked capacity), or else -- if there is room -- create a region of scratch
+ // space after |first_match| by shifting the tail of the string to a higher
+ // index, and doing in-place moves from the tail to lower indices thereafter.
+ size_t str_length = str->length();
+ size_t expansion = 0;  // Total growth in characters; stays 0 when shrinking.
+ if (replace_length > find_length) {
+ // This operation lengthens the string; determine the new length by counting
+ // matches.
+ const size_t expansion_per_match = (replace_length - find_length);
+ size_t num_matches = 0;
+ for (size_t match = first_match; match != StringType::npos;
+ match = matcher.Find(*str, match + find_length)) {
+ expansion += expansion_per_match;
+ ++num_matches;
+ }
+ const size_t final_length = str_length + expansion;
+
+ if (str->capacity() < final_length) {
+ // If we'd have to allocate a new buffer to grow the string, build the
+ // result directly into the new allocation via append().
+ StringType src(str->get_allocator());
+ str->swap(src);  // |src| now holds the original text; |*str| is empty.
+ str->reserve(final_length);
+
+ size_t pos = 0;
+ for (size_t match = first_match;; match = matcher.Find(src, pos)) {
+ str->append(src, pos, match - pos);
+ str->append(replace_with.data(), replace_length);
+ pos = match + find_length;
+
+ // A mid-loop test/break enables skipping the final Find() call; the
+ // number of matches is known, so don't search past the last one.
+ if (!--num_matches)
+ break;
+ }
+
+ // Handle substring after the final match.
+ str->append(src, pos, str_length - pos);
+ return true;
+ }
+
+ // Prepare for the copy/move loop below -- expand the string to its final
+ // size by shifting the data after the first match to the end of the resized
+ // string.
+ size_t shift_src = first_match + find_length;
+ size_t shift_dst = shift_src + expansion;
+
+ // Big |expansion| factors (relative to |str_length|) require padding up to
+ // |shift_dst|.
+ if (shift_dst > str_length)
+ str->resize(shift_dst);
+
+ str->replace(shift_dst, str_length - shift_src, *str, shift_src,
+ str_length - shift_src);
+ str_length = final_length;
+ }
+
+ // We can alternate replacement and move operations. This won't overwrite the
+ // unsearched region of the string so long as |write_offset| <= |read_offset|;
+ // that condition is always satisfied because:
+ //
+ // (a) If the string is being shortened, |expansion| is zero and
+ // |write_offset| grows slower than |read_offset|.
+ //
+ // (b) If the string is being lengthened, |write_offset| grows faster than
+ // |read_offset|, but |expansion| is big enough so that |write_offset|
+ // will only catch up to |read_offset| at the point of the last match.
+ auto* buffer = &((*str)[0]);
+ size_t write_offset = first_match;
+ size_t read_offset = first_match + expansion;
+ do {
+ if (replace_length) {
+ CharTraits::copy(buffer + write_offset, replace_with.data(),
+ replace_length);
+ write_offset += replace_length;
+ }
+ read_offset += find_length;  // Skip over the matched text.
+
+ // min() clamps StringType::npos (the largest unsigned value) to str_length.
+ size_t match = std::min(matcher.Find(*str, read_offset), str_length);
+
+ size_t length = match - read_offset;  // Unmatched run before the next match.
+ if (length) {
+ CharTraits::move(buffer + write_offset, buffer + read_offset, length);
+ write_offset += length;
+ read_offset += length;
+ }
+ } while (read_offset < str_length);
+
+ // If we're shortening the string, truncate it now.
+ str->resize(write_offset);
+ return true;
+}
+
+template <class StringType>
+bool ReplaceCharsT(const StringType& input,
+                   BasicStringPiece<StringType> find_any_of_these,
+                   BasicStringPiece<StringType> replace_with,
+                   StringType* output) {
+  // Work on a copy of |input| held in |output|. Callers commonly pass the same
+  // string for both, in which case this assignment is inexpensive.
+  *output = input;
+
+  CharacterMatcher<StringType> any_of{find_any_of_these};
+  return DoReplaceMatchesAfterOffset(output, 0, any_of, replace_with,
+                                     ReplaceType::REPLACE_ALL);
+}
+
+void ReplaceFirstSubstringAfterOffset(string16* str,
+                                      size_t start_offset,
+                                      StringPiece16 find_this,
+                                      StringPiece16 replace_with) {
+  SubstringMatcher<string16> matcher{find_this};
+  DoReplaceMatchesAfterOffset(str, start_offset, matcher, replace_with,
+                              ReplaceType::REPLACE_FIRST);  // One match only.
+}
+
+void ReplaceFirstSubstringAfterOffset(std::string* str,
+                                      size_t start_offset,
+                                      StringPiece find_this,
+                                      StringPiece replace_with) {
+  SubstringMatcher<std::string> matcher{find_this};
+  DoReplaceMatchesAfterOffset(str, start_offset, matcher, replace_with,
+                              ReplaceType::REPLACE_FIRST);  // One match only.
+}
+
+void ReplaceSubstringsAfterOffset(string16* str,
+                                  size_t start_offset,
+                                  StringPiece16 find_this,
+                                  StringPiece16 replace_with) {
+  SubstringMatcher<string16> matcher{find_this};
+  DoReplaceMatchesAfterOffset(str, start_offset, matcher, replace_with,
+                              ReplaceType::REPLACE_ALL);  // Every match.
+}
+
+void ReplaceSubstringsAfterOffset(std::string* str,
+                                  size_t start_offset,
+                                  StringPiece find_this,
+                                  StringPiece replace_with) {
+  SubstringMatcher<std::string> matcher{find_this};
+  DoReplaceMatchesAfterOffset(str, start_offset, matcher, replace_with,
+                              ReplaceType::REPLACE_ALL);  // Every match.
+}
+
+template <class string_type>
+inline typename string_type::value_type* WriteIntoT(string_type* str,
+ size_t length_with_null) {
+ GURL_DCHECK_GT(length_with_null, 1u);  // A size-0 result would make [0] unsafe.
+ str->reserve(length_with_null);  // Capacity includes room for the terminator.
+ str->resize(length_with_null - 1);  // Size excludes the terminator.
+ return &((*str)[0]);
+}
+
+char* WriteInto(std::string* str, size_t length_with_null) {
+ return WriteIntoT(str, length_with_null);  // std::string instantiation.
+}
+
+char16* WriteInto(string16* str, size_t length_with_null) {
+ return WriteIntoT(str, length_with_null);  // string16 instantiation.
+}
+
+#if defined(_MSC_VER) && !defined(__clang__)
+// Work around VC++ code-gen bug. https://crbug.com/804884
+#pragma optimize("", off)
+#endif
+
+// Generic version for all JoinString overloads. |list_type| must be a sequence
+// (std::vector or std::initializer_list) of strings/StringPieces (std::string,
+// string16, StringPiece or StringPiece16). |string_type| is either std::string
+// or string16. Returns the parts concatenated with |sep| between each pair.
+template <typename list_type, typename string_type>
+static string_type JoinStringT(const list_type& parts,
+ BasicStringPiece<string_type> sep) {
+ if (parts.size() == 0)
+ return string_type();  // Nothing to join.
+
+ // Pre-allocate the eventual size of the string. Start with the size of all of
+ // the separators (note that this *assumes* parts.size() > 0).
+ size_t total_size = (parts.size() - 1) * sep.size();
+ for (const auto& part : parts)
+ total_size += part.size();
+ string_type result;
+ result.reserve(total_size);
+
+ auto iter = parts.begin();
+ GURL_DCHECK(iter != parts.end());
+ AppendToString(&result, *iter);  // First part goes in without a separator.
+ ++iter;
+
+ for (; iter != parts.end(); ++iter) {
+ sep.AppendToString(&result);
+ // Using the overloaded AppendToString allows this template function to work
+ // on both strings and StringPieces without creating an intermediate
+ // StringPiece object.
+ AppendToString(&result, *iter);
+ }
+
+ // Sanity-check that we pre-allocated correctly.
+ GURL_DCHECK_EQ(total_size, result.size());
+
+ return result;
+}
+
+std::string JoinString(const std::vector<std::string>& parts,
+ StringPiece separator) {  // Sits inside the VC++ optimizer workaround region.
+ return JoinStringT(parts, separator);
+}
+
+string16 JoinString(const std::vector<string16>& parts,
+ StringPiece16 separator) {  // Sits inside the VC++ optimizer workaround region.
+ return JoinStringT(parts, separator);
+}
+
+#if defined(_MSC_VER) && !defined(__clang__)
+// Work around VC++ code-gen bug. https://crbug.com/804884
+#pragma optimize("", on)
+#endif
+
+std::string JoinString(const std::vector<StringPiece>& parts,
+ StringPiece separator) {  // Vector of narrow string pieces.
+ return JoinStringT(parts, separator);
+}
+
+string16 JoinString(const std::vector<StringPiece16>& parts,
+ StringPiece16 separator) {  // Vector of 16-bit string pieces.
+ return JoinStringT(parts, separator);
+}
+
+std::string JoinString(std::initializer_list<StringPiece> parts,
+ StringPiece separator) {  // Braced list of narrow string pieces.
+ return JoinStringT(parts, separator);
+}
+
+string16 JoinString(std::initializer_list<StringPiece16> parts,
+ StringPiece16 separator) {  // Braced list of 16-bit string pieces.
+ return JoinStringT(parts, separator);
+}
+
+template<class FormatStringType, class OutStringType>
+OutStringType DoReplaceStringPlaceholders(
+ const FormatStringType& format_string,
+ const std::vector<OutStringType>& subst,
+ std::vector<size_t>* offsets) {  // |offsets| may be null.
+ size_t substitutions = subst.size();
+ GURL_DCHECK_LT(substitutions, 10U);  // Placeholders are single digits 1-9.
+
+ size_t sub_length = 0;
+ for (const auto& cur : subst)
+ sub_length += cur.length();
+
+ OutStringType formatted;
+ formatted.reserve(format_string.length() + sub_length);  // Size estimate.
+
+ std::vector<ReplacementOffset> r_offsets;
+ for (auto i = format_string.begin(); i != format_string.end(); ++i) {
+ if ('$' == *i) {
+ if (i + 1 != format_string.end()) {  // Otherwise a lone trailing $ is dropped.
+ ++i;
+ if ('$' == *i) {  // Run of n dollar signs: emit n - 1 of them.
+ while (i != format_string.end() && '$' == *i) {
+ formatted.push_back('$');
+ ++i;
+ }
+ --i;  // Step back so the loop increment lands just past the run.
+ } else {
+ if (*i < '1' || *i > '9') {
+ GURL_DLOG(ERROR) << "Invalid placeholder: $" << *i;
+ continue;  // Both the $ and the invalid character are dropped.
+ }
+ uintptr_t index = *i - '1';  // Zero-based index into |subst|.
+ if (offsets) {
+ ReplacementOffset r_offset(index,
+ static_cast<int>(formatted.size()));
+ r_offsets.insert(  // Keeps |r_offsets| ordered per CompareParameter.
+ std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
+ &CompareParameter),
+ r_offset);
+ }
+ if (index < substitutions)  // Out-of-range placeholders expand to nothing.
+ formatted.append(subst.at(index));
+ }
+ }
+ } else {
+ formatted.push_back(*i);  // Ordinary character: copy through.
+ }
+ }
+ if (offsets) {
+ for (const auto& cur : r_offsets)
+ offsets->push_back(cur.offset);  // Appends; existing entries are kept.
+ }
+ return formatted;
+}
+
+string16 ReplaceStringPlaceholders(const string16& format_string,
+ const std::vector<string16>& subst,
+ std::vector<size_t>* offsets) {  // |offsets| may be null.
+ return DoReplaceStringPlaceholders(format_string, subst, offsets);
+}
+
+std::string ReplaceStringPlaceholders(StringPiece format_string,
+ const std::vector<std::string>& subst,
+ std::vector<size_t>* offsets) {  // |offsets| may be null.
+ return DoReplaceStringPlaceholders(format_string, subst, offsets);
+}
+
+string16 ReplaceStringPlaceholders(const string16& format_string,
+                                   const string16& a,
+                                   size_t* offset) {
+  std::vector<size_t> offsets;
+  const std::vector<string16> subst(1, a);
+  string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
+  // Exactly one placeholder is expected, but that is only a debug check. Never
+  // index |offsets| when nothing was recorded; |*offset| is then left untouched.
+  GURL_DCHECK_EQ(1U, offsets.size());
+  if (offset && !offsets.empty())
+    *offset = offsets[0];
+  return result;
+}
+
+// The following code is compatible with the OpenBSD lcpy interface. See:
+// http://www.gratisoft.us/todd/papers/strlcpy.html
+// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
+
+namespace {
+
+// Copies |src| into |dst| (capacity |dst_size| characters, terminator included)
+// and returns the full length of |src|; a result >= |dst_size| means truncation.
+template <typename CHAR>
+size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
+  size_t copied = 0;
+  for (; copied < dst_size; ++copied) {
+    dst[copied] = src[copied];
+    if (dst[copied] == 0)
+      return copied;  // The terminator fit; nothing was truncated.
+  }
+  if (dst_size != 0)
+    dst[dst_size - 1] = 0;  // Ran out of room: terminate over the last char.
+  while (src[copied]) ++copied;  // Count the rest of |src| for its length.
+  return copied;
+}
+
+}  // namespace
+
+size_t strlcpy(char* dst, const char* src, size_t dst_size) {
+ return lcpyT<char>(dst, src, dst_size);  // |dst_size| counts chars.
+}
+size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
+ return lcpyT<wchar_t>(dst, src, dst_size);  // |dst_size| counts wchar_t units.
+}
+
+} // namespace base
diff --git a/base/strings/string_util.h b/base/strings/string_util.h
new file mode 100644
index 0000000..5a8cb02
--- /dev/null
+++ b/base/strings/string_util.h
@@ -0,0 +1,530 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This file defines utility functions for working with strings.
+
+#ifndef BASE_STRINGS_STRING_UTIL_H_
+#define BASE_STRINGS_STRING_UTIL_H_
+
+#include <ctype.h>
+#include <stdarg.h> // va_list
+#include <stddef.h>
+#include <stdint.h>
+
+#include <initializer_list>
+#include <string>
+#include <vector>
+
+#include "polyfills/base/base_export.h"
+#include "base/compiler_specific.h"
+#include "base/stl_util.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h" // For implicit conversions.
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+// C standard-library functions that aren't cross-platform are provided as
+// "gurl_base::...", and their prototypes are listed below. These functions are
+// then implemented as inline calls to the platform-specific equivalents in the
+// platform-specific headers.
+
+// Wrapper for vsnprintf that always null-terminates and always returns the
+// number of characters that would be in an untruncated formatted
+// string, even when truncation occurs.
+int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments)
+ PRINTF_FORMAT(3, 0);
+
+// Some of these implementations need to be inlined.
+
+// We separate the declaration from the implementation of this inline
+// function just so the PRINTF_FORMAT works.
+inline int snprintf(char* buffer, size_t size, const char* format, ...)
+ PRINTF_FORMAT(3, 4);  // Presumably: format is argument 3, varargs start at 4.
+inline int snprintf(char* buffer, size_t size, const char* format, ...) {
+ va_list arguments;
+ va_start(arguments, format);
+ int result = vsnprintf(buffer, size, format, arguments);  // The wrapper above.
+ va_end(arguments);
+ return result;
+}
+
+// BSD-style safe and consistent string copy functions.
+// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
+// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
+// long as |dst_size| is not 0. Returns the length of |src| in characters.
+// If the return value is >= dst_size, then the output was truncated.
+// NOTE: All sizes are in number of characters, NOT in bytes.
+BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size);
+BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);
+
+// Scan a wprintf format string to determine whether it's portable across a
+// variety of systems. This function only checks that the conversion
+// specifiers used by the format string are supported and have the same meaning
+// on a variety of systems. It doesn't check for other errors that might occur
+// within a format string.
+//
+// Nonportable conversion specifiers for wprintf are:
+// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char
+// data on all systems except Windows, which treat them as wchar_t data.
+// Use %ls and %lc for wchar_t data instead.
+// - 'S' and 'C', which operate on wchar_t data on all systems except Windows,
+// which treat them as char data. Use %ls and %lc for wchar_t data
+// instead.
+// - 'F', which is not identified by Windows wprintf documentation.
+// - 'D', 'O', and 'U', which are deprecated and not available on all systems.
+// Use %ld, %lo, and %lu instead.
+//
+// Note that there is no portable conversion specifier for char data when
+// working with wprintf.
+//
+// This function is intended to be called from gurl_base::vswprintf.
+BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format);
+
+// ASCII-specific tolower. The standard library's tolower is locale sensitive,
+// so we don't want to use it here.
+inline char ToLowerASCII(char c) {
+  return (c < 'A' || c > 'Z') ? c : static_cast<char>(c + ('a' - 'A'));
+}
+inline char16 ToLowerASCII(char16 c) {
+  return (c < 'A' || c > 'Z') ? c : static_cast<char16>(c + ('a' - 'A'));
+}
+
+// ASCII-specific toupper. The standard library's toupper is locale sensitive,
+// so we don't want to use it here.
+inline char ToUpperASCII(char c) {
+  return (c < 'a' || c > 'z') ? c : static_cast<char>(c + ('A' - 'a'));
+}
+inline char16 ToUpperASCII(char16 c) {
+  return (c < 'a' || c > 'z') ? c : static_cast<char16>(c + ('A' - 'a'));
+}
+
+// Converts the given string to its ASCII-lowercase equivalent.
+BASE_EXPORT std::string ToLowerASCII(StringPiece str);
+BASE_EXPORT string16 ToLowerASCII(StringPiece16 str);
+
+// Converts the given string to its ASCII-uppercase equivalent.
+BASE_EXPORT std::string ToUpperASCII(StringPiece str);
+BASE_EXPORT string16 ToUpperASCII(StringPiece16 str);
+
+// Functor for case-insensitive ASCII comparisons for STL algorithms like
+// std::search.
+//
+// Note that a full Unicode version of this functor is not possible to write
+// because case mappings might change the number of characters, depend on
+// context (combining accents), and require handling UTF-16. If you need
+// proper Unicode support, use gurl_base::i18n::ToLower/FoldCase and then just
+// use a normal operator== on the result.
+template<typename Char> struct CaseInsensitiveCompareASCII {
+ public:
+ bool operator()(Char x, Char y) const {  // Equality predicate, not an ordering.
+ return ToLowerASCII(x) == ToLowerASCII(y);  // Only A-Z are case-folded.
+ }
+};
+
+// Like strcasecmp for case-insensitive ASCII characters only. Returns:
+// -1 (a < b)
+// 0 (a == b)
+// 1 (a > b)
+// (unlike strcasecmp which can return values greater or less than 1/-1). For
+// full Unicode support, use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase
+// and then just call the normal string operators on the result.
+BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b);
+BASE_EXPORT int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b);
+
+// Equality for ASCII case-insensitive comparisons. For full Unicode support,
+// use gurl_base::i18n::ToLower or gurl_base::i18n::FoldCase and then compare with either
+// == or !=.
+BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b);
+BASE_EXPORT bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b);
+
+// These threadsafe functions return references to globally unique empty
+// strings.
+//
+// It is likely faster to construct a new empty string object (just a few
+// instructions to set the length to 0) than to get the empty string instance
+// returned by these functions (which requires threadsafe static access).
+//
+// Therefore, DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT
+// CONSTRUCTORS. There is only one case where you should use these: functions
+// which need to return a string by reference (e.g. as a class member
+// accessor), and don't have an empty string to use (e.g. in an error case).
+// These should not be used as initializers, function arguments, or return
+// values for functions which return by value or outparam.
+BASE_EXPORT const std::string& EmptyString();
+BASE_EXPORT const string16& EmptyString16();
+
+// Contains the set of characters representing whitespace in the corresponding
+// encoding. Null-terminated. The ASCII versions are the whitespaces as defined
+// by HTML5, and don't include control characters.
+BASE_EXPORT extern const wchar_t kWhitespaceWide[]; // Includes Unicode.
+BASE_EXPORT extern const char16 kWhitespaceUTF16[]; // Includes Unicode.
+BASE_EXPORT extern const char kWhitespaceASCII[];
+BASE_EXPORT extern const char16 kWhitespaceASCIIAs16[]; // No unicode.
+
+// Null-terminated string representing the UTF-8 byte order mark.
+BASE_EXPORT extern const char kUtf8ByteOrderMark[];
+
+// Removes characters in |remove_chars| from anywhere in |input|. Returns true
+// if any characters were removed. |remove_chars| must be null-terminated.
+// NOTE: Safe to use the same variable for both |input| and |output|.
+BASE_EXPORT bool RemoveChars(const string16& input,
+ StringPiece16 remove_chars,
+ string16* output);
+BASE_EXPORT bool RemoveChars(const std::string& input,
+ StringPiece remove_chars,
+ std::string* output);
+
+// Replaces characters in |replace_chars| from anywhere in |input| with
+// |replace_with|. Each character in |replace_chars| will be replaced with
+// the |replace_with| string. Returns true if any characters were replaced.
+// |replace_chars| must be null-terminated.
+// NOTE: Safe to use the same variable for both |input| and |output|.
+BASE_EXPORT bool ReplaceChars(const string16& input,
+ StringPiece16 replace_chars,
+ const string16& replace_with,
+ string16* output);
+BASE_EXPORT bool ReplaceChars(const std::string& input,
+ StringPiece replace_chars,
+ const std::string& replace_with,
+ std::string* output);
+
+enum TrimPositions {  // Bit flags; LEADING and TRAILING combine into ALL.
+ TRIM_NONE = 0,
+ TRIM_LEADING = 1 << 0,  // Bit 0.
+ TRIM_TRAILING = 1 << 1,  // Bit 1.
+ TRIM_ALL = TRIM_LEADING | TRIM_TRAILING,  // Both bits set.
+};
+
+// Removes characters in |trim_chars| from the beginning and end of |input|.
+// The 8-bit version only works on 8-bit characters, not UTF-8. Returns true if
+// any characters were removed.
+//
+// It is safe to use the same variable for both |input| and |output| (this is
+// the normal usage to trim in-place).
+BASE_EXPORT bool TrimString(const string16& input,
+ StringPiece16 trim_chars,
+ string16* output);
+BASE_EXPORT bool TrimString(const std::string& input,
+ StringPiece trim_chars,
+ std::string* output);
+
+// StringPiece versions of the above. The returned pieces refer to the original
+// buffer.
+BASE_EXPORT StringPiece16 TrimString(StringPiece16 input,
+ StringPiece16 trim_chars,
+ TrimPositions positions);
+BASE_EXPORT StringPiece TrimString(StringPiece input,
+ StringPiece trim_chars,
+ TrimPositions positions);
+
+// Truncates a string to the nearest UTF-8 character that will leave
+// the string less than or equal to the specified byte size.
+BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input,
+ const size_t byte_size,
+ std::string* output);
+
+#if defined(WCHAR_T_IS_UTF16)
+// Utility functions to access the underlying string buffer as a wide char
+// pointer.
+//
+// Note: These functions violate strict aliasing when char16 and wchar_t are
+// unrelated types. We thus pass -fno-strict-aliasing to the compiler on
+// non-Windows platforms [1], and rely on it being off in Clang's CL mode [2].
+//
+// [1] https://crrev.com/b9a0976622/build/config/compiler/BUILD.gn#244
+// [2]
+// https://github.com/llvm/llvm-project/blob/1e28a66/clang/lib/Driver/ToolChains/Clang.cpp#L3949
+inline wchar_t* as_writable_wcstr(char16* str) {
+ return reinterpret_cast<wchar_t*>(str);  // Pointer cast only; no copy.
+}
+
+inline wchar_t* as_writable_wcstr(string16& str) {
+ return reinterpret_cast<wchar_t*>(data(str));  // Aliases the buffer of |str|.
+}
+
+inline const wchar_t* as_wcstr(const char16* str) {
+ return reinterpret_cast<const wchar_t*>(str);  // Pointer cast only; no copy.
+}
+
+inline const wchar_t* as_wcstr(StringPiece16 str) {
+ return reinterpret_cast<const wchar_t*>(str.data());  // Aliases the data of |str|.
+}
+
+// Utility functions to access the underlying string buffer as a char16 pointer.
+inline char16* as_writable_u16cstr(wchar_t* str) {
+ return reinterpret_cast<char16*>(str);  // Pointer cast only; no copy.
+}
+
+inline char16* as_writable_u16cstr(std::wstring& str) {
+ return reinterpret_cast<char16*>(data(str));  // Aliases the buffer of |str|.
+}
+
+inline const char16* as_u16cstr(const wchar_t* str) {
+ return reinterpret_cast<const char16*>(str);  // Pointer cast only; no copy.
+}
+
+inline const char16* as_u16cstr(WStringPiece str) {
+ return reinterpret_cast<const char16*>(str.data());  // Aliases the data of |str|.
+}
+#endif // defined(WCHAR_T_IS_UTF16)
+
+// Trims any whitespace from either end of the input string.
+//
+// The StringPiece versions return a substring referencing the input buffer.
+// The ASCII versions look only for ASCII whitespace.
+//
+// The std::string versions return where whitespace was found.
+// NOTE: Safe to use the same variable for both input and output.
+BASE_EXPORT TrimPositions TrimWhitespace(const string16& input,
+ TrimPositions positions,
+ string16* output);
+BASE_EXPORT StringPiece16 TrimWhitespace(StringPiece16 input,
+ TrimPositions positions);
+BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input,
+ TrimPositions positions,
+ std::string* output);
+BASE_EXPORT StringPiece TrimWhitespaceASCII(StringPiece input,
+ TrimPositions positions);
+
+// Searches for CR or LF characters. Removes all contiguous whitespace
+// strings that contain them. This is useful when trying to deal with text
+// copied from terminals.
+// Returns |text|, with the following three transformations:
+// (1) Leading and trailing whitespace is trimmed.
+// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace
+// sequences containing a CR or LF are trimmed.
+// (3) All other whitespace sequences are converted to single spaces.
+BASE_EXPORT string16 CollapseWhitespace(
+ const string16& text,
+ bool trim_sequences_with_line_breaks);
+BASE_EXPORT std::string CollapseWhitespaceASCII(
+ const std::string& text,
+ bool trim_sequences_with_line_breaks);
+
+// Returns true if |input| is empty or contains only characters found in
+// |characters|.
+BASE_EXPORT bool ContainsOnlyChars(StringPiece input, StringPiece characters);
+BASE_EXPORT bool ContainsOnlyChars(StringPiece16 input,
+ StringPiece16 characters);
+
+// Returns true if the specified string matches the criteria. How can a wide
+// string be 8-bit or UTF8? It contains only characters that are < 256 (in the
+// first case) or characters that use only 8-bits and whose 8-bit
+// representation looks like a UTF-8 string (the second case).
+//
+// Note that IsStringUTF8 checks not only if the input is structurally
+// valid but also if it doesn't contain any non-character codepoint
+// (e.g. U+FFFE). It's done on purpose because all the existing callers want
+// to have the maximum 'discriminating' power from other encodings. If
+// there's a use case for just checking the structural validity, we have to
+// add a new function for that.
+//
+// IsStringASCII assumes the input is likely all ASCII, and does not leave early
+// if it is not the case.
+BASE_EXPORT bool IsStringUTF8(StringPiece str);
+BASE_EXPORT bool IsStringASCII(StringPiece str);
+BASE_EXPORT bool IsStringASCII(StringPiece16 str);
+#if defined(WCHAR_T_IS_UTF32)
+BASE_EXPORT bool IsStringASCII(WStringPiece str);
+#endif
+
+// Compare the lower-case form of the given string against the given
+// previously-lower-cased ASCII string (typically a constant).
+BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece str,
+ StringPiece lowecase_ascii);
+BASE_EXPORT bool LowerCaseEqualsASCII(StringPiece16 str,
+ StringPiece lowecase_ascii);
+
+// Performs a case-sensitive string compare of the given 16-bit string against
+// the given 8-bit ASCII string (typically a constant). The behavior is
+// undefined if the |ascii| string is not ASCII.
+BASE_EXPORT bool EqualsASCII(StringPiece16 str, StringPiece ascii);
+
+// Indicates case sensitivity of comparisons. Only ASCII case insensitivity
+// is supported. Full Unicode case-insensitive conversions would need to go in
+// base/i18n so it can use ICU.
+//
+// If you need to do Unicode-aware case-insensitive StartsWith/EndsWith, it's
+// best to call gurl_base::i18n::ToLower() or gurl_base::i18n::FoldCase() (see
+// base/i18n/case_conversion.h for usage advice) on the arguments, and then use
+// the results to a case-sensitive comparison.
+enum class CompareCase {
+ SENSITIVE,  // Characters must match exactly.
+ INSENSITIVE_ASCII,  // Only ASCII case differences are ignored.
+};
+
+BASE_EXPORT bool StartsWith(StringPiece str,
+ StringPiece search_for,
+ CompareCase case_sensitivity);
+BASE_EXPORT bool StartsWith(StringPiece16 str,
+ StringPiece16 search_for,
+ CompareCase case_sensitivity);
+BASE_EXPORT bool EndsWith(StringPiece str,
+ StringPiece search_for,
+ CompareCase case_sensitivity);
+BASE_EXPORT bool EndsWith(StringPiece16 str,
+ StringPiece16 search_for,
+ CompareCase case_sensitivity);
+
+// Determines the type of ASCII character, independent of locale (the C
+// library versions will change based on locale).
+template <typename Char>
+inline bool IsAsciiWhitespace(Char c) {
+  return c == ' ' || (c >= '\t' && c <= '\r' && c != '\v');  // \t \n \f \r
+}
+template <typename Char>
+inline bool IsAsciiAlpha(Char c) {
+  return (c >= 'a') ? (c <= 'z') : (c >= 'A' && c <= 'Z');
+}
+template <typename Char>
+inline bool IsAsciiUpper(Char c) {
+  return 'A' <= c && c <= 'Z';
+}
+template <typename Char>
+inline bool IsAsciiLower(Char c) {
+  return 'a' <= c && c <= 'z';
+}
+template <typename Char>
+inline bool IsAsciiDigit(Char c) {
+  return '0' <= c && c <= '9';
+}
+template <typename Char>
+inline bool IsAsciiPrintable(Char c) {
+  return ' ' <= c && c <= '~';
+}
+
+template <typename Char>
+inline bool IsHexDigit(Char c) {
+  if (c >= '0' && c <= '9')
+    return true;  // Decimal digit.
+  return (c >= 'a') ? (c <= 'f') : (c >= 'A' && c <= 'F');  // Hex letter.
+}
+
+// Returns the integer corresponding to the given hex character. For example:
+// '4' -> 4
+// 'a' -> 10
+// 'B' -> 11
+// Assumes the input is a valid hex character. DCHECKs in debug builds if not.
+BASE_EXPORT char HexDigitToInt(wchar_t c);
+
+// Returns true if it's a Unicode whitespace character.
+BASE_EXPORT bool IsUnicodeWhitespace(wchar_t c);
+
+// Return a byte string in human-readable format with a unit suffix. Not
+// appropriate for use in any UI; use of FormatBytes and friends in ui/base is
+// highly recommended instead. TODO(avi): Figure out how to get callers to use
+// FormatBytes instead; remove this.
+BASE_EXPORT string16 FormatBytesUnlocalized(int64_t bytes);
+
+// Starting at |start_offset| (usually 0), replace the first instance of
+// |find_this| with |replace_with|.
+BASE_EXPORT void ReplaceFirstSubstringAfterOffset(
+ gurl_base::string16* str,
+ size_t start_offset,
+ StringPiece16 find_this,
+ StringPiece16 replace_with);
+BASE_EXPORT void ReplaceFirstSubstringAfterOffset(
+ std::string* str,
+ size_t start_offset,
+ StringPiece find_this,
+ StringPiece replace_with);
+
+// Starting at |start_offset| (usually 0), look through |str| and replace all
+// instances of |find_this| with |replace_with|.
+//
+// This does entire substrings; use std::replace in <algorithm> for single
+// characters, for example:
+// std::replace(str.begin(), str.end(), 'a', 'b');
+BASE_EXPORT void ReplaceSubstringsAfterOffset(
+ string16* str,
+ size_t start_offset,
+ StringPiece16 find_this,
+ StringPiece16 replace_with);
+BASE_EXPORT void ReplaceSubstringsAfterOffset(
+ std::string* str,
+ size_t start_offset,
+ StringPiece find_this,
+ StringPiece replace_with);
+
+// Reserves enough memory in |str| to accommodate |length_with_null| characters,
+// sets the size of |str| to |length_with_null - 1| characters, and returns a
+// pointer to the underlying contiguous array of characters. This is typically
+// used when calling a function that writes results into a character array, but
+// the caller wants the data to be managed by a string-like object. It is
+// convenient in that it can be used inline in the call, and fast in that it
+// avoids copying the results of the call from a char* into a string.
+//
+// |length_with_null| must be at least 2, since otherwise the underlying string
+// would have size 0, and trying to access &((*str)[0]) in that case can result
+// in a number of problems.
+//
+// Internally, this takes linear time because the resize() call 0-fills the
+// underlying array for potentially all
+// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we
+// could avoid this aspect of the resize() call, as we expect the caller to
+// immediately write over this memory, but there is no other way to set the size
+// of the string, and not doing that will mean people who access |str| rather
+// than str.c_str() will get back a string of whatever size |str| had on entry
+// to this function (probably 0).
+BASE_EXPORT char* WriteInto(std::string* str, size_t length_with_null);
+BASE_EXPORT char16* WriteInto(string16* str, size_t length_with_null);
+
+// Does the opposite of SplitString()/SplitStringPiece(). Joins a vector or list
+// of strings into a single string, inserting |separator| (which may be empty)
+// in between all elements.
+//
+// If possible, callers should build a vector of StringPieces and use the
+// StringPiece variant, so that they do not create unnecessary copies of
+// strings. For example, instead of using SplitString, modifying the vector,
+// then using JoinString, use SplitStringPiece followed by JoinString so that no
+// copies of those strings are created until the final join operation.
+//
+// Use StrCat (in base/strings/strcat.h) if you don't need a separator.
+BASE_EXPORT std::string JoinString(const std::vector<std::string>& parts,
+ StringPiece separator);
+BASE_EXPORT string16 JoinString(const std::vector<string16>& parts,
+ StringPiece16 separator);
+BASE_EXPORT std::string JoinString(const std::vector<StringPiece>& parts,
+ StringPiece separator);
+BASE_EXPORT string16 JoinString(const std::vector<StringPiece16>& parts,
+ StringPiece16 separator);
+// Explicit initializer_list overloads are required to break ambiguity when used
+// with a literal initializer list (otherwise the compiler would not be able to
+// decide between the string and StringPiece overloads).
+BASE_EXPORT std::string JoinString(std::initializer_list<StringPiece> parts,
+ StringPiece separator);
+BASE_EXPORT string16 JoinString(std::initializer_list<StringPiece16> parts,
+ StringPiece16 separator);
+
+// Replace $1-$2-$3..$9 in the format string with values from |subst|.
+// Additionally, any number of consecutive '$' characters is replaced by that
+// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be
+// NULL. This only allows you to use up to nine replacements.
+BASE_EXPORT string16 ReplaceStringPlaceholders(
+ const string16& format_string,
+ const std::vector<string16>& subst,
+ std::vector<size_t>* offsets);
+
+BASE_EXPORT std::string ReplaceStringPlaceholders(
+ StringPiece format_string,
+ const std::vector<std::string>& subst,
+ std::vector<size_t>* offsets);
+
+// Single-string shortcut for ReplaceStringPlaceholders. |offset| may be NULL.
+BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string,
+ const string16& a,
+ size_t* offset);
+
+} // namespace gurl_base
+
+#if defined(OS_WIN)
+#include "base/strings/string_util_win.h"
+#elif defined(OS_POSIX) || defined(OS_FUCHSIA)
+#include "base/strings/string_util_posix.h"
+#else
+#error Define string operations appropriately for your platform
+#endif
+
+#endif // BASE_STRINGS_STRING_UTIL_H_
diff --git a/base/strings/string_util_constants.cc b/base/strings/string_util_constants.cc
new file mode 100644
index 0000000..3ca29b7
--- /dev/null
+++ b/base/strings/string_util_constants.cc
@@ -0,0 +1,67 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+
+namespace gurl_base {
+
+#define WHITESPACE_UNICODE /* comma-separated code points */ \
+ 0x0009, /* CHARACTER TABULATION */ \
+ 0x000A, /* LINE FEED (LF) */ \
+ 0x000B, /* LINE TABULATION */ \
+ 0x000C, /* FORM FEED (FF) */ \
+ 0x000D, /* CARRIAGE RETURN (CR) */ \
+ 0x0020, /* SPACE */ \
+ 0x0085, /* NEXT LINE (NEL) */ \
+ 0x00A0, /* NO-BREAK SPACE */ \
+ 0x1680, /* OGHAM SPACE MARK */ \
+ 0x2000, /* EN QUAD */ \
+ 0x2001, /* EM QUAD */ \
+ 0x2002, /* EN SPACE */ \
+ 0x2003, /* EM SPACE */ \
+ 0x2004, /* THREE-PER-EM SPACE */ \
+ 0x2005, /* FOUR-PER-EM SPACE */ \
+ 0x2006, /* SIX-PER-EM SPACE */ \
+ 0x2007, /* FIGURE SPACE */ \
+ 0x2008, /* PUNCTUATION SPACE */ \
+ 0x2009, /* THIN SPACE */ \
+ 0x200A, /* HAIR SPACE */ \
+ 0x2028, /* LINE SEPARATOR */ \
+ 0x2029, /* PARAGRAPH SEPARATOR */ \
+ 0x202F, /* NARROW NO-BREAK SPACE */ \
+ 0x205F, /* MEDIUM MATHEMATICAL SPACE */ \
+ 0x3000, /* IDEOGRAPHIC SPACE */ \
+ 0 /* list terminator; ends every array built from this macro */
+
+const wchar_t kWhitespaceWide[] = {  // Macro list as wchar_t, 0-terminated.
+ WHITESPACE_UNICODE
+};
+
+const char16 kWhitespaceUTF16[] = {  // Same list as char16, 0-terminated.
+ WHITESPACE_UNICODE
+};
+
+const char kWhitespaceASCII[] = {  // The six ASCII whitespace bytes.
+ 0x09, // CHARACTER TABULATION
+ 0x0A, // LINE FEED (LF)
+ 0x0B, // LINE TABULATION
+ 0x0C, // FORM FEED (FF)
+ 0x0D, // CARRIAGE RETURN (CR)
+ 0x20, // SPACE
+ 0  // terminator
+};
+
+const char16 kWhitespaceASCIIAs16[] = {  // Same six values, stored as char16.
+ 0x09, // CHARACTER TABULATION
+ 0x0A, // LINE FEED (LF)
+ 0x0B, // LINE TABULATION
+ 0x0C, // FORM FEED (FF)
+ 0x0D, // CARRIAGE RETURN (CR)
+ 0x20, // SPACE
+ 0  // terminator
+};
+
+const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";  // U+FEFF encoded as UTF-8.
+
+} // namespace gurl_base
diff --git a/base/strings/string_util_perftest.cc b/base/strings/string_util_perftest.cc
new file mode 100644
index 0000000..033df0e
--- /dev/null
+++ b/base/strings/string_util_perftest.cc
@@ -0,0 +1,46 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+
+#include <cinttypes>
+
+#include "base/time/time.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+template <typename String>  // Times 10M IsStringASCII() calls; prints one row.
+void MeasureIsStringASCII(size_t str_length, size_t non_ascii_pos) {
+ String str(str_length, 'A');
+ if (non_ascii_pos < str_length)  // Out-of-range position: input stays ASCII.
+ str[non_ascii_pos] = '\xAF';
+
+ TimeTicks t0 = TimeTicks::Now();
+ for (size_t i = 0; i < 10000000; ++i)
+ IsStringASCII(str);
+ TimeDelta time = TimeTicks::Now() - t0;
+ printf(  // InMilliseconds() is a signed 64-bit count, so print with PRId64.
+ "char-size:\t%zu\tlength:\t%zu\tnon-ascii-pos:\t%zu\ttime-ms:\t%" PRId64
+ "\n",
+ sizeof(typename String::value_type), str_length, non_ascii_pos,
+ time.InMilliseconds());
+}
+
+TEST(StringUtilTest, DISABLED_IsStringASCIIPerf) {  // gtest skips DISABLED_*.
+ for (size_t str_length = 4; str_length <= 1024; str_length *= 2) {  // 4..1024
+ for (size_t non_ascii_loc = 0; non_ascii_loc < 3; ++non_ascii_loc) {
+ size_t non_ascii_pos = str_length * non_ascii_loc / 2 + 2;  // last: len + 2
+ MeasureIsStringASCII<std::string>(str_length, non_ascii_pos);
+ MeasureIsStringASCII<string16>(str_length, non_ascii_pos);
+#if defined(WCHAR_T_IS_UTF32)
+ MeasureIsStringASCII<std::basic_string<wchar_t>>(str_length,
+ non_ascii_pos);
+#endif
+ }
+ }
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_util_posix.h b/base/strings/string_util_posix.h
new file mode 100644
index 0000000..e1ba7c3
--- /dev/null
+++ b/base/strings/string_util_posix.h
@@ -0,0 +1,37 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING_UTIL_POSIX_H_
+#define BASE_STRINGS_STRING_UTIL_POSIX_H_
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+
+#include "polyfills/base/logging.h"
+
+namespace gurl_base {
+
+// Chromium code style is to not use malloc'd strings; this wrapper exists only
+// for interaction with APIs that require one. Forwards to the global ::strdup.
+inline char* strdup(const char* str) {
+ return ::strdup(str);
+}
+
+inline int vsnprintf(char* buffer, size_t size,
+ const char* format, va_list arguments) {
+ return ::vsnprintf(buffer, size, format, arguments);  // Plain pass-through.
+}
+
+inline int vswprintf(wchar_t* buffer, size_t size,
+ const wchar_t* format, va_list arguments) {
+ GURL_DCHECK(IsWprintfFormatPortable(format));  // Checked before forwarding.
+ return ::vswprintf(buffer, size, format, arguments);
+}
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_UTIL_POSIX_H_
diff --git a/base/strings/string_util_unittest.cc b/base/strings/string_util_unittest.cc
new file mode 100644
index 0000000..51b4ee1
--- /dev/null
+++ b/base/strings/string_util_unittest.cc
@@ -0,0 +1,1430 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+
+#include <math.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <type_traits>
+
+#include "base/stl_util.h"
+#include "base/strings/string16.h"
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using ::testing::ElementsAre;
+
+namespace gurl_base {
+
+static const struct trim_case {  // Wide-string rows for the TrimWhitespace test.
+ const wchar_t* input;  // String handed to TrimWhitespace().
+ const TrimPositions positions;  // Which ends the call is asked to trim.
+ const wchar_t* output;  // Expected trimmed result.
+ const TrimPositions return_value;  // Expected return value of the call.
+} trim_cases[] = {
+ {L" Google Video ", TRIM_LEADING, L"Google Video ", TRIM_LEADING},
+ {L" Google Video ", TRIM_TRAILING, L" Google Video", TRIM_TRAILING},
+ {L" Google Video ", TRIM_ALL, L"Google Video", TRIM_ALL},
+ {L"Google Video", TRIM_ALL, L"Google Video", TRIM_NONE},
+ {L"", TRIM_ALL, L"", TRIM_NONE},
+ {L" ", TRIM_LEADING, L"", TRIM_LEADING},
+ {L" ", TRIM_TRAILING, L"", TRIM_TRAILING},
+ {L" ", TRIM_ALL, L"", TRIM_ALL},
+ {L"\t\rTest String\n", TRIM_ALL, L"Test String", TRIM_ALL},
+ {L"\x2002Test String\x00A0\x3000", TRIM_ALL, L"Test String", TRIM_ALL},
+};
+
+static const struct trim_case_ascii {  // Rows for the TrimWhitespaceASCII loop.
+ const char* input;  // String handed to TrimWhitespaceASCII().
+ const TrimPositions positions;  // Which ends the call is asked to trim.
+ const char* output;  // Expected trimmed result.
+ const TrimPositions return_value;  // Expected return value of the call.
+} trim_cases_ascii[] = {
+ {" Google Video ", TRIM_LEADING, "Google Video ", TRIM_LEADING},
+ {" Google Video ", TRIM_TRAILING, " Google Video", TRIM_TRAILING},
+ {" Google Video ", TRIM_ALL, "Google Video", TRIM_ALL},
+ {"Google Video", TRIM_ALL, "Google Video", TRIM_NONE},
+ {"", TRIM_ALL, "", TRIM_NONE},
+ {" ", TRIM_LEADING, "", TRIM_LEADING},
+ {" ", TRIM_TRAILING, "", TRIM_TRAILING},
+ {" ", TRIM_ALL, "", TRIM_ALL},
+ {"\t\rTest String\n", TRIM_ALL, "Test String", TRIM_ALL},
+};
+
+namespace {
+
+// Helper for the TruncateUTF8ToByteSize test: true if the length changed.
+bool Truncated(const std::string& input,
+ const size_t byte_size,
+ std::string* output) {
+ size_t prev = input.length();  // Read first: |output| may alias |input|.
+ TruncateUTF8ToByteSize(input, byte_size, output);
+ return prev != output->length();
+}
+
+} // namespace
+
+TEST(StringUtilTest, TruncateUTF8ToByteSize) {
+ std::string output;  // Reused, without clearing, by every case below.
+
+ // Empty input, byte_size 0, and byte_size larger than the input
+ EXPECT_FALSE(Truncated(std::string(), 0, &output));
+ EXPECT_EQ(output, "");
+ EXPECT_TRUE(Truncated("\xe1\x80\xbf", 0, &output));
+ EXPECT_EQ(output, "");
+ EXPECT_FALSE(Truncated("\xe1\x80\xbf", static_cast<size_t>(-1), &output));
+ EXPECT_FALSE(Truncated("\xe1\x80\xbf", 4, &output));
+
+ // Valid UTF-8 is cut only at a character boundary
+ EXPECT_TRUE(Truncated("abc", 2, &output));
+ EXPECT_EQ(output, "ab");
+ EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 2, &output));
+ EXPECT_EQ(output.compare("\xc2\x81"), 0);
+ EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 3, &output));
+ EXPECT_EQ(output.compare("\xc2\x81"), 0);
+ EXPECT_FALSE(Truncated("\xc2\x81\xc2\x81", 4, &output));
+ EXPECT_EQ(output.compare("\xc2\x81\xc2\x81"), 0);
+
+ {
+ const char array[] = "\x00\x00\xc2\x81\xc2\x81";
+ const std::string array_string(array, gurl_base::size(array));
+ EXPECT_TRUE(Truncated(array_string, 4, &output));
+ EXPECT_EQ(output.compare(std::string("\x00\x00\xc2\x81", 4)), 0);
+ }
+
+ {
+ const char array[] = "\x00\xc2\x81\xc2\x81";
+ const std::string array_string(array, gurl_base::size(array));
+ EXPECT_TRUE(Truncated(array_string, 4, &output));
+ EXPECT_EQ(output.compare(std::string("\x00\xc2\x81", 3)), 0);
+ }
+
+ // Testing invalid UTF8
+ EXPECT_TRUE(Truncated("\xed\xa0\x80\xed\xbf\xbf", 6, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xed\xa0\x8f", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xed\xbf\xbf", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ // Testing invalid UTF8 mixed with valid UTF8
+ EXPECT_FALSE(Truncated("\xe1\x80\xbf", 3, &output));
+ EXPECT_EQ(output.compare("\xe1\x80\xbf"), 0);
+ EXPECT_FALSE(Truncated("\xf1\x80\xa0\xbf", 4, &output));
+ EXPECT_EQ(output.compare("\xf1\x80\xa0\xbf"), 0);
+ EXPECT_FALSE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf",
+ 10, &output));
+ EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"), 0);
+ EXPECT_TRUE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1""a""\x80\xa0",
+ 10, &output));
+ EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1""a"), 0);
+ EXPECT_FALSE(Truncated("\xef\xbb\xbf" "abc", 6, &output));
+ EXPECT_EQ(output.compare("\xef\xbb\xbf" "abc"), 0);
+
+ // Overlong sequences
+ EXPECT_TRUE(Truncated("\xc0\x80", 2, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xc1\x80\xc1\x81", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xe0\x80\x80", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xe0\x82\x80", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xe0\x9f\xbf", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf0\x80\x80\x8D", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf0\x80\x82\x91", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf0\x80\xa0\x80", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf0\x8f\xbb\xbf", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf8\x80\x80\x80\xbf", 5, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xfc\x80\x80\x80\xa0\xa5", 6, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ // Beyond U+10FFFF (the upper limit of Unicode codespace)
+ EXPECT_TRUE(Truncated("\xf4\x90\x80\x80", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf8\xa0\xbf\x80\xbf", 5, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xfc\x9c\xbf\x80\xbf\x80", 6, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)
+ EXPECT_TRUE(Truncated("\xfe\xff", 2, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xff\xfe", 2, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ {
+ const char array[] = "\x00\x00\xfe\xff";
+ const std::string array_string(array, gurl_base::size(array));
+ EXPECT_TRUE(Truncated(array_string, 4, &output));
+ EXPECT_EQ(output.compare(std::string("\x00\x00", 2)), 0);
+ }
+
+ // Variants on the previous test
+ {
+ const char array[] = "\xff\xfe\x00\x00";
+ const std::string array_string(array, 4);
+ EXPECT_FALSE(Truncated(array_string, 4, &output));
+ EXPECT_EQ(output.compare(std::string("\xff\xfe\x00\x00", 4)), 0);
+ }
+ {
+ const char array[] = "\xff\x00\x00\xfe";
+ const std::string array_string(array, gurl_base::size(array));
+ EXPECT_TRUE(Truncated(array_string, 4, &output));
+ EXPECT_EQ(output.compare(std::string("\xff\x00\x00", 3)), 0);
+ }
+
+ // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
+ EXPECT_TRUE(Truncated("\xef\xbf\xbe", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf0\x8f\xbf\xbe", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xf3\xbf\xbf\xbf", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xef\xb7\x90", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_TRUE(Truncated("\xef\xb7\xaf", 3, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ // Byte strings from legacy (non-UTF-8) encodings. As the \xa7\x41\xa6\x6e
+ // case shows, invalid bytes ahead of the final character are left in place.
+ EXPECT_TRUE(Truncated("caf\xe9", 4, &output));
+ EXPECT_EQ(output.compare("caf"), 0);
+ EXPECT_TRUE(Truncated("\xb0\xa1\xb0\xa2", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+ EXPECT_FALSE(Truncated("\xa7\x41\xa6\x6e", 4, &output));
+ EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0);
+ EXPECT_TRUE(Truncated("\xa7\x41\xa6\x6e\xd9\xee\xe4\xee", 7,
+ &output));
+ EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0);
+
+ // Testing using the same string as input and output.
+ EXPECT_FALSE(Truncated(output, 4, &output));
+ EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0);
+ EXPECT_TRUE(Truncated(output, 3, &output));
+ EXPECT_EQ(output.compare("\xa7\x41"), 0);
+
+ // "abc" with U+201[CD] in windows-125[0-8]
+ EXPECT_TRUE(Truncated("\x93" "abc\x94", 5, &output));
+ EXPECT_EQ(output.compare("\x93" "abc"), 0);
+
+ // U+0639 U+064E U+0644 U+064E in ISO-8859-6
+ EXPECT_TRUE(Truncated("\xd9\xee\xe4\xee", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+
+ // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
+ EXPECT_TRUE(Truncated("\xe3\xe5\xe9\xdC", 4, &output));
+ EXPECT_EQ(output.compare(""), 0);
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+TEST(StringUtilTest, as_wcstr) {  // Checks result type and address per input.
+ char16 rw_buffer[10] = {};  // Writable array.
+ static_assert(
+ std::is_same<wchar_t*, decltype(as_writable_wcstr(rw_buffer))>::value,
+ "");
+ EXPECT_EQ(static_cast<void*>(rw_buffer), as_writable_wcstr(rw_buffer));
+
+ string16 rw_str(10, '\0');  // Writable string.
+ static_assert(
+ std::is_same<wchar_t*, decltype(as_writable_wcstr(rw_str))>::value, "");
+ EXPECT_EQ(static_cast<const void*>(rw_str.data()), as_writable_wcstr(rw_str));
+
+ const char16 ro_buffer[10] = {};  // Read-only array.
+ static_assert(
+ std::is_same<const wchar_t*, decltype(as_wcstr(ro_buffer))>::value, "");
+ EXPECT_EQ(static_cast<const void*>(ro_buffer), as_wcstr(ro_buffer));
+
+ const string16 ro_str(10, '\0');  // Read-only string.
+ static_assert(std::is_same<const wchar_t*, decltype(as_wcstr(ro_str))>::value,
+ "");
+ EXPECT_EQ(static_cast<const void*>(ro_str.data()), as_wcstr(ro_str));
+
+ StringPiece16 piece = ro_buffer;  // String piece over the read-only array.
+ static_assert(std::is_same<const wchar_t*, decltype(as_wcstr(piece))>::value,
+ "");
+ EXPECT_EQ(static_cast<const void*>(piece.data()), as_wcstr(piece));
+}
+
+TEST(StringUtilTest, as_u16cstr) {  // Checks result type and address per input.
+ wchar_t rw_buffer[10] = {};  // Writable array.
+ static_assert(
+ std::is_same<char16*, decltype(as_writable_u16cstr(rw_buffer))>::value,
+ "");
+ EXPECT_EQ(static_cast<void*>(rw_buffer), as_writable_u16cstr(rw_buffer));
+
+ std::wstring rw_str(10, '\0');  // Writable string.
+ static_assert(
+ std::is_same<char16*, decltype(as_writable_u16cstr(rw_str))>::value, "");
+ EXPECT_EQ(static_cast<const void*>(rw_str.data()),
+ as_writable_u16cstr(rw_str));
+
+ const wchar_t ro_buffer[10] = {};  // Read-only array.
+ static_assert(
+ std::is_same<const char16*, decltype(as_u16cstr(ro_buffer))>::value, "");
+ EXPECT_EQ(static_cast<const void*>(ro_buffer), as_u16cstr(ro_buffer));
+
+ const std::wstring ro_str(10, '\0');  // Read-only string.
+ static_assert(
+ std::is_same<const char16*, decltype(as_u16cstr(ro_str))>::value, "");
+ EXPECT_EQ(static_cast<const void*>(ro_str.data()), as_u16cstr(ro_str));
+
+ WStringPiece piece = ro_buffer;  // String piece over the read-only array.
+ static_assert(std::is_same<const char16*, decltype(as_u16cstr(piece))>::value,
+ "");
+ EXPECT_EQ(static_cast<const void*>(piece.data()), as_u16cstr(piece));
+}
+#endif // defined(WCHAR_T_IS_UTF16)
+
+TEST(StringUtilTest, TrimWhitespace) {
+ string16 output; // Allow contents to carry over to next testcase
+ for (const auto& value : trim_cases) {
+ EXPECT_EQ(value.return_value,
+ TrimWhitespace(WideToUTF16(value.input), value.positions,
+ &output));
+ EXPECT_EQ(WideToUTF16(value.output), output);
+ }
+
+ // Test that TrimWhitespace() can take the same string for input and output
+ output = ASCIIToUTF16(" This is a test \r\n");
+ EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output));
+ EXPECT_EQ(ASCIIToUTF16("This is a test"), output);
+
+ // Once more, but with a string that is entirely whitespace
+ output = ASCIIToUTF16(" \r\n");
+ EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output));
+ EXPECT_EQ(string16(), output);
+
+ std::string output_ascii;  // Likewise reused across the ASCII cases.
+ for (const auto& value : trim_cases_ascii) {
+ EXPECT_EQ(value.return_value,
+ TrimWhitespaceASCII(value.input, value.positions, &output_ascii));
+ EXPECT_EQ(value.output, output_ascii);
+ }
+}
+
+static const struct collapse_case {  // Rows for the CollapseWhitespace test.
+ const wchar_t* input;  // String handed to CollapseWhitespace().
+ const bool trim;  // Second argument of the call; see the last row's effect.
+ const wchar_t* output;  // Expected collapsed result.
+} collapse_cases[] = {
+ {L" Google Video ", false, L"Google Video"},
+ {L"Google Video", false, L"Google Video"},
+ {L"", false, L""},
+ {L" ", false, L""},
+ {L"\t\rTest String\n", false, L"Test String"},
+ {L"\x2002Test String\x00A0\x3000", false, L"Test String"},
+ {L" Test \n \t String ", false, L"Test String"},
+ {L"\x2002Test\x1680 \x2028 \tString\x00A0\x3000", false, L"Test String"},
+ {L" Test String", false, L"Test String"},
+ {L"Test String ", false, L"Test String"},
+ {L"Test String", false, L"Test String"},
+ {L"", true, L""},
+ {L"\n", true, L""},
+ {L" \r ", true, L""},
+ {L"\nFoo", true, L"Foo"},
+ {L"\r Foo ", true, L"Foo"},
+ {L" Foo bar ", true, L"Foo bar"},
+ {L" \tFoo bar \n", true, L"Foo bar"},
+ {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"},
+};
+
+TEST(StringUtilTest, CollapseWhitespace) {  // Table-driven over collapse_cases.
+ for (const auto& value : collapse_cases) {
+ EXPECT_EQ(WideToUTF16(value.output),
+ CollapseWhitespace(WideToUTF16(value.input), value.trim));
+ }
+}
+
+static const struct collapse_case_ascii {  // Rows for CollapseWhitespaceASCII.
+ const char* input;  // String handed to CollapseWhitespaceASCII().
+ const bool trim;  // Second argument of the call; see the last row's effect.
+ const char* output;  // Expected collapsed result.
+} collapse_cases_ascii[] = {
+ {" Google Video ", false, "Google Video"},
+ {"Google Video", false, "Google Video"},
+ {"", false, ""},
+ {" ", false, ""},
+ {"\t\rTest String\n", false, "Test String"},
+ {" Test \n \t String ", false, "Test String"},
+ {" Test String", false, "Test String"},
+ {"Test String ", false, "Test String"},
+ {"Test String", false, "Test String"},
+ {"", true, ""},
+ {"\n", true, ""},
+ {" \r ", true, ""},
+ {"\nFoo", true, "Foo"},
+ {"\r Foo ", true, "Foo"},
+ {" Foo bar ", true, "Foo bar"},
+ {" \tFoo bar \n", true, "Foo bar"},
+ {" a \r b\n c \r\n d \t\re \t f \n ", true, "abcde f"},
+};
+
+TEST(StringUtilTest, CollapseWhitespaceASCII) {  // Uses collapse_cases_ascii.
+ for (const auto& value : collapse_cases_ascii) {
+ EXPECT_EQ(value.output, CollapseWhitespaceASCII(value.input, value.trim));
+ }
+}
+
+TEST(StringUtilTest, IsStringUTF8) {
+ EXPECT_TRUE(IsStringUTF8("abc"));
+ EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
+ EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
+ EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
+ EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
+ EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM
+
+ // surrogate code points
+ EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
+ EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
+ EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));
+
+ // overlong sequences
+ EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000
+ EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "@A" (U+0040 U+0041)
+ EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000
+ EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080
+ EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff
+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D
+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091
+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800
+ EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM)
+ EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F
+ EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+0825
+
+ // Beyond U+10FFFF (the upper limit of Unicode codespace)
+ EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000
+ EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes
+ EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes
+
+ // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)
+ EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
+ EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
+ EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
+ EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00"));
+
+ // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
+ EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE
+ EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong U+FFFE, not U+1FFFE
+ EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF
+ EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0
+ EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF
+ // Strings in legacy encodings. We can certainly make up strings
+ // in a legacy encoding that are valid in UTF-8, but in real data,
+ // most of them are invalid as UTF-8.
+ EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1
+ EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR
+ EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5
+ // "abc" with U+201[CD] in windows-125[0-8]
+ EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
+ // U+0639 U+064E U+0644 U+064E in ISO-8859-6
+ EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
+ // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
+ EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));
+
+ // Check that we support Embedded Nulls. The first uses the canonical UTF-8
+ // representation, and the second uses a 2-byte sequence. The second version
+ // is invalid UTF-8 since UTF-8 states that the shortest encoding for a
+ // given codepoint must be used.
+ static const char kEmbeddedNull[] = "embedded\0null";
+ EXPECT_TRUE(IsStringUTF8(
+ std::string(kEmbeddedNull, sizeof(kEmbeddedNull))));
+ EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000"));
+}
+
+TEST(StringUtilTest, IsStringASCII) {
+ static char char_ascii[] =
+ "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
+ static char16 char16_ascii[] = {
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A',
+ 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6',
+ '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0 };
+ static std::wstring wchar_ascii(
+ L"0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF");
+
+ // Test a variety of the fragment start positions and lengths in order to make
+ // sure that bit masking in IsStringASCII works correctly.
+ // Also, test that a non-ASCII character is detected at every index of the
+ // fragment, i.e. for char_pos in [offset, offset + len).
+ {
+ const size_t string_length = gurl_base::size(char_ascii) - 1;
+ for (size_t offset = 0; offset < 8; ++offset) {
+ for (size_t len = 0, max_len = string_length - offset; len < max_len;
+ ++len) {
+ EXPECT_TRUE(IsStringASCII(StringPiece(char_ascii + offset, len)));
+ for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
+ char_ascii[char_pos] |= '\x80';  // Poison one character...
+ EXPECT_FALSE(IsStringASCII(StringPiece(char_ascii + offset, len)));
+ char_ascii[char_pos] &= ~'\x80';  // ...then restore it.
+ }
+ }
+ }
+ }
+
+ {
+ const size_t string_length = gurl_base::size(char16_ascii) - 1;
+ for (size_t offset = 0; offset < 4; ++offset) {
+ for (size_t len = 0, max_len = string_length - offset; len < max_len;
+ ++len) {
+ EXPECT_TRUE(IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+ for (size_t char_pos = offset; char_pos < offset + len; ++char_pos) {
+ char16_ascii[char_pos] |= 0x80;
+ EXPECT_FALSE(
+ IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+ char16_ascii[char_pos] &= ~0x80;
+ // Also test when the upper half is non-zero.
+ char16_ascii[char_pos] |= 0x100;
+ EXPECT_FALSE(
+ IsStringASCII(StringPiece16(char16_ascii + offset, len)));
+ char16_ascii[char_pos] &= ~0x100;
+ }
+ }
+ }
+ }
+
+#if defined(WCHAR_T_IS_UTF32)
+ {
+ const size_t string_length = wchar_ascii.length();
+ for (size_t len = 0; len < string_length; ++len) {
+ EXPECT_TRUE(IsStringASCII(wchar_ascii.substr(0, len)));
+ for (size_t char_pos = 0; char_pos < len; ++char_pos) {
+ wchar_ascii[char_pos] |= 0x80;
+ EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len)));
+ wchar_ascii[char_pos] &= ~0x80;
+ wchar_ascii[char_pos] |= 0x100;
+ EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len)));
+ wchar_ascii[char_pos] &= ~0x100;
+ wchar_ascii[char_pos] |= 0x10000;
+ EXPECT_FALSE(IsStringASCII(wchar_ascii.substr(0, len)));
+ wchar_ascii[char_pos] &= ~0x10000;
+ }
+ }
+ }
+#endif // WCHAR_T_IS_UTF32
+}
+
+TEST(StringUtilTest, ConvertASCII) {
+ static const char* const char_cases[] = {  // Narrow inputs.
+ "Google Video",
+ "Hello, world\n",
+ "0123ABCDwxyz \a\b\t\r\n!+,.~"
+ };
+
+ static const wchar_t* const wchar_cases[] = {  // Same text, index by index.
+ L"Google Video",
+ L"Hello, world\n",
+ L"0123ABCDwxyz \a\b\t\r\n!+,.~"
+ };
+
+ for (size_t i = 0; i < gurl_base::size(char_cases); ++i) {
+ EXPECT_TRUE(IsStringASCII(char_cases[i]));
+ string16 utf16 = ASCIIToUTF16(char_cases[i]);
+ EXPECT_EQ(WideToUTF16(wchar_cases[i]), utf16);
+
+ std::string ascii = UTF16ToASCII(WideToUTF16(wchar_cases[i]));
+ EXPECT_EQ(char_cases[i], ascii);
+ }
+
+ EXPECT_FALSE(IsStringASCII("Google \x80Video"));
+
+ // Convert empty strings.
+ string16 empty16;
+ std::string empty;
+ EXPECT_EQ(empty, UTF16ToASCII(empty16));
+ EXPECT_EQ(empty16, ASCIIToUTF16(empty));
+
+ // Convert strings with an embedded NUL (length excludes the terminator).
+ const char chars_with_nul[] = "test\0string";
+ const int length_with_nul = gurl_base::size(chars_with_nul) - 1;
+ std::string string_with_nul(chars_with_nul, length_with_nul);
+ string16 string16_with_nul = ASCIIToUTF16(string_with_nul);
+ EXPECT_EQ(static_cast<string16::size_type>(length_with_nul),
+ string16_with_nul.length());
+ std::string narrow_with_nul = UTF16ToASCII(string16_with_nul);
+ EXPECT_EQ(static_cast<std::string::size_type>(length_with_nul),
+ narrow_with_nul.length());
+ EXPECT_EQ(0, string_with_nul.compare(narrow_with_nul));
+}
+
+TEST(StringUtilTest, ToLowerASCII) {  // char, char16 and whole-string forms.
+ EXPECT_EQ('c', ToLowerASCII('C'));
+ EXPECT_EQ('c', ToLowerASCII('c'));
+ EXPECT_EQ('2', ToLowerASCII('2'));  // Non-letters come back unchanged.
+
+ EXPECT_EQ(static_cast<char16>('c'), ToLowerASCII(static_cast<char16>('C')));
+ EXPECT_EQ(static_cast<char16>('c'), ToLowerASCII(static_cast<char16>('c')));
+ EXPECT_EQ(static_cast<char16>('2'), ToLowerASCII(static_cast<char16>('2')));
+
+ EXPECT_EQ("cc2", ToLowerASCII("Cc2"));
+ EXPECT_EQ(ASCIIToUTF16("cc2"), ToLowerASCII(ASCIIToUTF16("Cc2")));
+}
+
+TEST(StringUtilTest, ToUpperASCII) {  // char, char16 and whole-string forms.
+ EXPECT_EQ('C', ToUpperASCII('C'));
+ EXPECT_EQ('C', ToUpperASCII('c'));
+ EXPECT_EQ('2', ToUpperASCII('2'));  // Non-letters come back unchanged.
+
+ EXPECT_EQ(static_cast<char16>('C'), ToUpperASCII(static_cast<char16>('C')));
+ EXPECT_EQ(static_cast<char16>('C'), ToUpperASCII(static_cast<char16>('c')));
+ EXPECT_EQ(static_cast<char16>('2'), ToUpperASCII(static_cast<char16>('2')));
+
+ EXPECT_EQ("CC2", ToUpperASCII("Cc2"));
+ EXPECT_EQ(ASCIIToUTF16("CC2"), ToUpperASCII(ASCIIToUTF16("Cc2")));
+}
+
+TEST(StringUtilTest, LowerCaseEqualsASCII) {
+ static const struct {
+ const char* src_a;  // Input in arbitrary case.
+ const char* dst;  // Lower-case string it is expected to match.
+ } lowercase_cases[] = {
+ { "FoO", "foo" },
+ { "foo", "foo" },
+ { "FOO", "foo" },
+ };
+
+ for (const auto& i : lowercase_cases) {
+ EXPECT_TRUE(LowerCaseEqualsASCII(ASCIIToUTF16(i.src_a), i.dst));  // UTF-16
+ EXPECT_TRUE(LowerCaseEqualsASCII(i.src_a, i.dst));  // Narrow input.
+ }
+}
+
+TEST(StringUtilTest, FormatBytesUnlocalized) {
+ static const struct {
+ int64_t bytes;
+ const char* expected;
+ } cases[] = {
+ // Expected behavior: we show one post-decimal digit when we have
+ // under three pre-decimal digits, except in cases where it makes no
+ // sense (zero or bytes).
+ // Since we switch units once we cross the 1000 mark, this keeps
+ // the display of file sizes or bytes consistently around three
+ // digits.
+ {0, "0 B"},
+ {512, "512 B"},
+ {1024*1024, "1.0 MB"},
+ {1024*1024*1024, "1.0 GB"},
+ {10LL*1024*1024*1024, "10.0 GB"},
+ {99LL*1024*1024*1024, "99.0 GB"},
+ {105LL*1024*1024*1024, "105 GB"},
+ {105LL*1024*1024*1024 + 500LL*1024*1024, "105 GB"},
+ {~(1LL << 63), "8192 PB"},  // Largest int64_t value.
+
+ {99*1024 + 103, "99.1 kB"},
+ {1024*1024 + 103, "1.0 MB"},
+ {1024*1024 + 205 * 1024, "1.2 MB"},
+ {1024*1024*1024 + (927 * 1024*1024), "1.9 GB"},
+ {10LL*1024*1024*1024, "10.0 GB"},
+ {100LL*1024*1024*1024, "100 GB"},
+ };
+
+ for (const auto& i : cases) {
+ EXPECT_EQ(ASCIIToUTF16(i.expected), FormatBytesUnlocalized(i.bytes));
+ }
+}
+TEST(StringUtilTest, ReplaceSubstringsAfterOffset) {
+ static const struct {
+ StringPiece str;
+ size_t start_offset;
+ StringPiece find_this;
+ StringPiece replace_with;
+ StringPiece expected;
+ } cases[] = {
+ {"aaa", 0, "", "b", "aaa"},
+ {"aaa", 1, "", "b", "aaa"},
+ {"aaa", 0, "a", "b", "bbb"},
+ {"aaa", 0, "aa", "b", "ba"},
+ {"aaa", 0, "aa", "bbb", "bbba"},
+ {"aaaaa", 0, "aa", "b", "bba"},
+ {"ababaaababa", 0, "aba", "", "baaba"},
+ {"ababaaababa", 0, "aba", "_", "_baa_ba"},
+ {"ababaaababa", 0, "aba", "__", "__baa__ba"},
+ {"ababaaababa", 0, "aba", "___", "___baa___ba"},
+ {"ababaaababa", 0, "aba", "____", "____baa____ba"},
+ {"ababaaababa", 0, "aba", "_____", "_____baa_____ba"},
+ {"abb", 0, "ab", "a", "ab"},
+ {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "},
+ {"Not found", 0, "x", "0", "Not found"},
+ {"Not found again", 5, "x", "0", "Not found again"},
+ {" Making it much longer ", 0, " ", "Four score and seven years ago",
+ "Four score and seven years agoMakingFour score and seven years agoit"
+ "Four score and seven years agomuchFour score and seven years agolonger"
+ "Four score and seven years ago"},
+ {" Making it much much much much shorter ", 0,
+ "Making it much much much much shorter", "", " "},
+ {"so much much much much much very much much much shorter", 0, "much ",
+ "", "so very shorter"},
+ {"Invalid offset", 9999, "t", "foobar", "Invalid offset"},
+ {"Replace me only me once", 9, "me ", "", "Replace me only once"},
+ {"abababab", 2, "ab", "c", "abccc"},
+ {"abababab", 1, "ab", "c", "abccc"},
+ {"abababab", 1, "aba", "c", "abcbab"},
+ };
+
+ // gurl_base::string16 variant
+ for (const auto& scenario : cases) {
+ string16 str = ASCIIToUTF16(scenario.str);
+ ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
+ ASCIIToUTF16(scenario.find_this),
+ ASCIIToUTF16(scenario.replace_with));
+ EXPECT_EQ(ASCIIToUTF16(scenario.expected), str);
+ }
+
+ // std::string with insufficient capacity: expansion must realloc the buffer.
+ for (const auto& scenario : cases) {
+ std::string str = scenario.str.as_string();
+ str.shrink_to_fit(); // This is nonbinding, but it's the best we've got.
+ ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
+ scenario.find_this, scenario.replace_with);
+ EXPECT_EQ(scenario.expected, str);
+ }
+
+ // std::string with ample capacity: should be possible to grow in-place.
+ for (const auto& scenario : cases) {
+ std::string str = scenario.str.as_string();
+ str.reserve(std::max(scenario.str.length(), scenario.expected.length()) *
+ 2);
+
+ ReplaceSubstringsAfterOffset(&str, scenario.start_offset,
+ scenario.find_this, scenario.replace_with);
+ EXPECT_EQ(scenario.expected, str);
+ }
+}
+
+TEST(StringUtilTest, ReplaceFirstSubstringAfterOffset) {
+ static const struct {
+ const char* str;
+ string16::size_type start_offset;
+ const char* find_this;
+ const char* replace_with;
+ const char* expected;
+ } cases[] = {
+ {"aaa", 0, "a", "b", "baa"},
+ {"abb", 0, "ab", "a", "ab"},
+ {"Removing some substrings inging", 0, "ing", "",
+ "Remov some substrings inging"},
+ {"Not found", 0, "x", "0", "Not found"},
+ {"Not found again", 5, "x", "0", "Not found again"},
+ {" Making it much longer ", 0, " ", "Four score and seven years ago",
+ "Four score and seven years agoMaking it much longer "},
+ {"Invalid offset", 9999, "t", "foobar", "Invalid offset"},
+ {"Replace me only me once", 4, "me ", "", "Replace only me once"},
+ {"abababab", 2, "ab", "c", "abcabab"},
+ };
+
+ for (const auto& i : cases) {
+ string16 str = ASCIIToUTF16(i.str);
+ ReplaceFirstSubstringAfterOffset(&str, i.start_offset,
+ ASCIIToUTF16(i.find_this),
+ ASCIIToUTF16(i.replace_with));
+ EXPECT_EQ(ASCIIToUTF16(i.expected), str);
+ }
+}
+
+TEST(StringUtilTest, HexDigitToInt) {
+ EXPECT_EQ(0, HexDigitToInt('0'));
+ EXPECT_EQ(1, HexDigitToInt('1'));
+ EXPECT_EQ(2, HexDigitToInt('2'));
+ EXPECT_EQ(3, HexDigitToInt('3'));
+ EXPECT_EQ(4, HexDigitToInt('4'));
+ EXPECT_EQ(5, HexDigitToInt('5'));
+ EXPECT_EQ(6, HexDigitToInt('6'));
+ EXPECT_EQ(7, HexDigitToInt('7'));
+ EXPECT_EQ(8, HexDigitToInt('8'));
+ EXPECT_EQ(9, HexDigitToInt('9'));
+ EXPECT_EQ(10, HexDigitToInt('A'));
+ EXPECT_EQ(11, HexDigitToInt('B'));
+ EXPECT_EQ(12, HexDigitToInt('C'));
+ EXPECT_EQ(13, HexDigitToInt('D'));
+ EXPECT_EQ(14, HexDigitToInt('E'));
+ EXPECT_EQ(15, HexDigitToInt('F'));
+
+ // Verify the lower case as well.
+ EXPECT_EQ(10, HexDigitToInt('a'));
+ EXPECT_EQ(11, HexDigitToInt('b'));
+ EXPECT_EQ(12, HexDigitToInt('c'));
+ EXPECT_EQ(13, HexDigitToInt('d'));
+ EXPECT_EQ(14, HexDigitToInt('e'));
+ EXPECT_EQ(15, HexDigitToInt('f'));
+}
+
+TEST(StringUtilTest, JoinString) {
+ std::string separator(", ");
+ std::vector<std::string> parts;
+ EXPECT_EQ(std::string(), JoinString(parts, separator));
+
+ parts.push_back(std::string());
+ EXPECT_EQ(std::string(), JoinString(parts, separator));
+ parts.clear();
+
+ parts.push_back("a");
+ EXPECT_EQ("a", JoinString(parts, separator));
+
+ parts.push_back("b");
+ parts.push_back("c");
+ EXPECT_EQ("a, b, c", JoinString(parts, separator));
+
+ parts.push_back(std::string());
+ EXPECT_EQ("a, b, c, ", JoinString(parts, separator));
+ parts.push_back(" ");
+ EXPECT_EQ("a|b|c|| ", JoinString(parts, "|"));
+}
+
+TEST(StringUtilTest, JoinString16) {
+ string16 separator = ASCIIToUTF16(", ");
+ std::vector<string16> parts;
+ EXPECT_EQ(string16(), JoinString(parts, separator));
+
+ parts.push_back(string16());
+ EXPECT_EQ(string16(), JoinString(parts, separator));
+ parts.clear();
+
+ parts.push_back(ASCIIToUTF16("a"));
+ EXPECT_EQ(ASCIIToUTF16("a"), JoinString(parts, separator));
+
+ parts.push_back(ASCIIToUTF16("b"));
+ parts.push_back(ASCIIToUTF16("c"));
+ EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString(parts, separator));
+
+ parts.push_back(ASCIIToUTF16(""));
+ EXPECT_EQ(ASCIIToUTF16("a, b, c, "), JoinString(parts, separator));
+ parts.push_back(ASCIIToUTF16(" "));
+ EXPECT_EQ(ASCIIToUTF16("a|b|c|| "), JoinString(parts, ASCIIToUTF16("|")));
+}
+
+TEST(StringUtilTest, JoinStringPiece) {
+ std::string separator(", ");
+ std::vector<StringPiece> parts;
+ EXPECT_EQ(std::string(), JoinString(parts, separator));
+
+ // Test empty first part (https://crbug.com/698073).
+ parts.push_back(StringPiece());
+ EXPECT_EQ(std::string(), JoinString(parts, separator));
+ parts.clear();
+
+ parts.push_back("a");
+ EXPECT_EQ("a", JoinString(parts, separator));
+
+ parts.push_back("b");
+ parts.push_back("c");
+ EXPECT_EQ("a, b, c", JoinString(parts, separator));
+
+ parts.push_back(StringPiece());
+ EXPECT_EQ("a, b, c, ", JoinString(parts, separator));
+ parts.push_back(" ");
+ EXPECT_EQ("a|b|c|| ", JoinString(parts, "|"));
+}
+
+TEST(StringUtilTest, JoinStringPiece16) {
+ string16 separator = ASCIIToUTF16(", ");
+ std::vector<StringPiece16> parts;
+ EXPECT_EQ(string16(), JoinString(parts, separator));
+
+ // Test empty first part (https://crbug.com/698073).
+ parts.push_back(StringPiece16());
+ EXPECT_EQ(string16(), JoinString(parts, separator));
+ parts.clear();
+
+ const string16 kA = ASCIIToUTF16("a");
+ parts.push_back(kA);
+ EXPECT_EQ(ASCIIToUTF16("a"), JoinString(parts, separator));
+
+ const string16 kB = ASCIIToUTF16("b");
+ parts.push_back(kB);
+ const string16 kC = ASCIIToUTF16("c");
+ parts.push_back(kC);
+ EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString(parts, separator));
+
+ parts.push_back(StringPiece16());
+ EXPECT_EQ(ASCIIToUTF16("a, b, c, "), JoinString(parts, separator));
+ const string16 kSpace = ASCIIToUTF16(" ");
+ parts.push_back(kSpace);
+ EXPECT_EQ(ASCIIToUTF16("a|b|c|| "), JoinString(parts, ASCIIToUTF16("|")));
+}
+
+TEST(StringUtilTest, JoinStringInitializerList) {
+ std::string separator(", ");
+ EXPECT_EQ(std::string(), JoinString({}, separator));
+
+ // Test empty first part (https://crbug.com/698073).
+ EXPECT_EQ(std::string(), JoinString({StringPiece()}, separator));
+
+ // With const char*s.
+ EXPECT_EQ("a", JoinString({"a"}, separator));
+ EXPECT_EQ("a, b, c", JoinString({"a", "b", "c"}, separator));
+ EXPECT_EQ("a, b, c, ", JoinString({"a", "b", "c", StringPiece()}, separator));
+ EXPECT_EQ("a|b|c|| ", JoinString({"a", "b", "c", StringPiece(), " "}, "|"));
+
+ // With std::strings.
+ const std::string kA = "a";
+ const std::string kB = "b";
+ EXPECT_EQ("a, b", JoinString({kA, kB}, separator));
+
+ // With StringPieces.
+ const StringPiece kPieceA = kA;
+ const StringPiece kPieceB = kB;
+ EXPECT_EQ("a, b", JoinString({kPieceA, kPieceB}, separator));
+}
+
+TEST(StringUtilTest, JoinStringInitializerList16) {
+ string16 separator = ASCIIToUTF16(", ");
+ EXPECT_EQ(string16(), JoinString({}, separator));
+
+ // Test empty first part (https://crbug.com/698073).
+ EXPECT_EQ(string16(), JoinString({StringPiece16()}, separator));
+
+ // With string16s.
+ const string16 kA = ASCIIToUTF16("a");
+ EXPECT_EQ(ASCIIToUTF16("a"), JoinString({kA}, separator));
+
+ const string16 kB = ASCIIToUTF16("b");
+ const string16 kC = ASCIIToUTF16("c");
+ EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString({kA, kB, kC}, separator));
+
+ EXPECT_EQ(ASCIIToUTF16("a, b, c, "),
+ JoinString({kA, kB, kC, StringPiece16()}, separator));
+ const string16 kSpace = ASCIIToUTF16(" ");
+ EXPECT_EQ(
+ ASCIIToUTF16("a|b|c|| "),
+ JoinString({kA, kB, kC, StringPiece16(), kSpace}, ASCIIToUTF16("|")));
+
+ // With StringPiece16s.
+ const StringPiece16 kPieceA = kA;
+ const StringPiece16 kPieceB = kB;
+ EXPECT_EQ(ASCIIToUTF16("a, b"), JoinString({kPieceA, kPieceB}, separator));
+}
+
+TEST(StringUtilTest, StartsWith) {
+ EXPECT_TRUE(StartsWith("javascript:url", "javascript",
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(StartsWith("JavaScript:url", "javascript",
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(StartsWith("javascript:url", "javascript",
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(StartsWith("JavaScript:url", "javascript",
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith("java", "javascript", gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(StartsWith("java", "javascript",
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith(std::string(), "javascript",
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith(std::string(), "javascript",
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(StartsWith("java", std::string(),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(StartsWith("java", std::string(), gurl_base::CompareCase::SENSITIVE));
+
+ EXPECT_TRUE(StartsWith(ASCIIToUTF16("javascript:url"),
+ ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(StartsWith(ASCIIToUTF16("JavaScript:url"),
+ ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(StartsWith(ASCIIToUTF16("javascript:url"),
+ ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(StartsWith(ASCIIToUTF16("JavaScript:url"),
+ ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith(ASCIIToUTF16("java"), ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(StartsWith(ASCIIToUTF16("java"), ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith(string16(), ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(StartsWith(string16(), ASCIIToUTF16("javascript"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(StartsWith(ASCIIToUTF16("java"), string16(),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(StartsWith(ASCIIToUTF16("java"), string16(),
+ gurl_base::CompareCase::SENSITIVE));
+}
+
+TEST(StringUtilTest, EndsWith) {
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.Plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.Plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(EndsWith(ASCIIToUTF16(".plug"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(EndsWith(ASCIIToUTF16(".plug"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.plugin Bar"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_FALSE(EndsWith(ASCIIToUTF16("Foo.plugin Bar"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(EndsWith(string16(), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_FALSE(EndsWith(string16(), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), string16(),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16("Foo.plugin"), string16(),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16(".plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(EndsWith(ASCIIToUTF16(".plugin"), ASCIIToUTF16(".plugin"),
+ gurl_base::CompareCase::SENSITIVE));
+ EXPECT_TRUE(
+ EndsWith(string16(), string16(), gurl_base::CompareCase::INSENSITIVE_ASCII));
+ EXPECT_TRUE(EndsWith(string16(), string16(), gurl_base::CompareCase::SENSITIVE));
+}
+
+TEST(StringUtilTest, GetStringFWithOffsets) {
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("1"));
+ subst.push_back(ASCIIToUTF16("2"));
+ std::vector<size_t> offsets;
+
+ ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $1. Your number is $2."),
+ subst,
+ &offsets);
+ EXPECT_EQ(2U, offsets.size());
+ EXPECT_EQ(7U, offsets[0]);
+ EXPECT_EQ(25U, offsets[1]);
+ offsets.clear();
+
+ ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $2. Your number is $1."),
+ subst,
+ &offsets);
+ EXPECT_EQ(2U, offsets.size());
+ EXPECT_EQ(25U, offsets[0]);
+ EXPECT_EQ(7U, offsets[1]);
+ offsets.clear();
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersTooFew) {
+ // Test whether ReplaceStringPlaceholders works as expected when there
+ // are fewer substitutions than placeholders ($4..$6 expand to nothing).
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("9a"));
+ subst.push_back(ASCIIToUTF16("8b"));
+ subst.push_back(ASCIIToUTF16("7c"));
+
+ string16 formatted =
+ ReplaceStringPlaceholders(
+ ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$1g,$2h,$3i"), subst, nullptr);
+
+ EXPECT_EQ(ASCIIToUTF16("9aa,8bb,7cc,d,e,f,9ag,8bh,7ci"), formatted);
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholders) {
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("9a"));
+ subst.push_back(ASCIIToUTF16("8b"));
+ subst.push_back(ASCIIToUTF16("7c"));
+ subst.push_back(ASCIIToUTF16("6d"));
+ subst.push_back(ASCIIToUTF16("5e"));
+ subst.push_back(ASCIIToUTF16("4f"));
+ subst.push_back(ASCIIToUTF16("3g"));
+ subst.push_back(ASCIIToUTF16("2h"));
+ subst.push_back(ASCIIToUTF16("1i"));
+
+ string16 formatted =
+ ReplaceStringPlaceholders(
+ ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i"), subst, nullptr);
+
+ EXPECT_EQ(ASCIIToUTF16("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii"), formatted);
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersNetExpansionWithContraction) {
+ // In this test, some of the substitutions are shorter than the placeholders,
+ // but overall the string gets longer.
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("9a____"));
+ subst.push_back(ASCIIToUTF16("B"));
+ subst.push_back(ASCIIToUTF16("7c___"));
+ subst.push_back(ASCIIToUTF16("d"));
+ subst.push_back(ASCIIToUTF16("5e____"));
+ subst.push_back(ASCIIToUTF16("F"));
+ subst.push_back(ASCIIToUTF16("3g___"));
+ subst.push_back(ASCIIToUTF16("h"));
+ subst.push_back(ASCIIToUTF16("1i_____"));
+
+ string16 original = ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i");
+ string16 expected =
+ ASCIIToUTF16("9a____a,Bb,7c___c,dd,5e____e,Ff,3g___g,hh,1i_____i");
+
+ EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, nullptr));
+
+ std::vector<size_t> offsets;
+ EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, &offsets));
+ std::vector<size_t> expected_offsets = {0, 8, 11, 18, 21, 29, 32, 39, 42};
+ EXPECT_EQ(expected_offsets, offsets);
+ ASSERT_EQ(subst.size(), offsets.size()); // Fatal: loop below indexes subst.
+ for (size_t i = 0; i < offsets.size(); i++) {
+ EXPECT_EQ(expected.substr(expected_offsets[i], subst[i].length()),
+ subst[i]);
+ }
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersNetContractionWithExpansion) {
+ // In this test, some of the substitutions are longer than the placeholders,
+ // but overall the string gets smaller. Additionally, the placeholders appear
+ // in a permuted order.
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("z"));
+ subst.push_back(ASCIIToUTF16("y"));
+ subst.push_back(ASCIIToUTF16("XYZW"));
+ subst.push_back(ASCIIToUTF16("x"));
+ subst.push_back(ASCIIToUTF16("w"));
+
+ string16 formatted =
+ ReplaceStringPlaceholders(ASCIIToUTF16("$3_$4$2$1$5"), subst, nullptr);
+
+ EXPECT_EQ(ASCIIToUTF16("XYZW_xyzw"), formatted);
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersOneDigit) {
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("1a"));
+ string16 formatted =
+ ReplaceStringPlaceholders(ASCIIToUTF16(" $16 "), subst, nullptr);
+ EXPECT_EQ(ASCIIToUTF16(" 1a6 "), formatted);
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersInvalidPlaceholder) {
+ std::vector<string16> subst;
+ subst.push_back(ASCIIToUTF16("1a"));
+ string16 formatted =
+ ReplaceStringPlaceholders(ASCIIToUTF16("+$-+$A+$1+"), subst, nullptr);
+ EXPECT_EQ(ASCIIToUTF16("+++1a+"), formatted);
+}
+
+TEST(StringUtilTest, StdStringReplaceStringPlaceholders) {
+ std::vector<std::string> subst;
+ subst.push_back("9a");
+ subst.push_back("8b");
+ subst.push_back("7c");
+ subst.push_back("6d");
+ subst.push_back("5e");
+ subst.push_back("4f");
+ subst.push_back("3g");
+ subst.push_back("2h");
+ subst.push_back("1i");
+
+ std::string formatted =
+ ReplaceStringPlaceholders(
+ "$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i", subst, nullptr);
+
+ EXPECT_EQ("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii", formatted);
+}
+
+TEST(StringUtilTest, StdStringReplaceStringPlaceholdersMultipleMatches) {
+ std::vector<std::string> subst;
+ subst.push_back("4"); // Referenced twice.
+ subst.push_back("?"); // Unreferenced.
+ subst.push_back("!"); // Unreferenced.
+ subst.push_back("16"); // Referenced once.
+
+ std::string original = "$1 * $1 == $4";
+ std::string expected = "4 * 4 == 16";
+ EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, nullptr));
+ std::vector<size_t> offsets;
+ EXPECT_EQ(expected, ReplaceStringPlaceholders(original, subst, &offsets));
+ std::vector<size_t> expected_offsets = {0, 4, 9};
+ EXPECT_EQ(expected_offsets, offsets);
+}
+
+TEST(StringUtilTest, ReplaceStringPlaceholdersConsecutiveDollarSigns) {
+ std::vector<std::string> subst;
+ subst.push_back("a");
+ subst.push_back("b");
+ subst.push_back("c");
+ EXPECT_EQ(ReplaceStringPlaceholders("$$1 $$$2 $$$$3", subst, nullptr),
+ "$1 $$2 $$$3");
+}
+
+TEST(StringUtilTest, LcpyTest) {
+ // Test the normal case where we fit in our buffer.
+ {
+ char dst[10];
+ wchar_t wdst[10];
+ EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst)));
+ EXPECT_EQ(0, memcmp(dst, "abcdefg", 8));
+ EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst)));
+ EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8));
+ }
+
+ // Test dst_size == 0, nothing should be written to |dst| and we should
+ // have the equivalent of strlen(src).
+ {
+ char dst[2] = {1, 2};
+ wchar_t wdst[2] = {1, 2};
+ EXPECT_EQ(7U, strlcpy(dst, "abcdefg", 0));
+ EXPECT_EQ(1, dst[0]);
+ EXPECT_EQ(2, dst[1]);
+ EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", 0));
+ EXPECT_EQ(static_cast<wchar_t>(1), wdst[0]);
+ EXPECT_EQ(static_cast<wchar_t>(2), wdst[1]);
+ }
+
+ // Test the case where we _just_ completely fit including the null.
+ {
+ char dst[8];
+ wchar_t wdst[8];
+ EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst)));
+ EXPECT_EQ(0, memcmp(dst, "abcdefg", 8));
+ EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst)));
+ EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8));
+ }
+
+ // Test the case where we are one smaller, so we can't fit the null.
+ {
+ char dst[7];
+ wchar_t wdst[7];
+ EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst)));
+ EXPECT_EQ(0, memcmp(dst, "abcdef", 7));
+ EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst)));
+ EXPECT_EQ(0, memcmp(wdst, L"abcdef", sizeof(wchar_t) * 7));
+ }
+
+ // Test the case where we are just too small.
+ {
+ char dst[3];
+ wchar_t wdst[3];
+ EXPECT_EQ(7U, strlcpy(dst, "abcdefg", gurl_base::size(dst)));
+ EXPECT_EQ(0, memcmp(dst, "ab", 3));
+ EXPECT_EQ(7U, wcslcpy(wdst, L"abcdefg", gurl_base::size(wdst)));
+ EXPECT_EQ(0, memcmp(wdst, L"ab", sizeof(wchar_t) * 3));
+ }
+}
+
+TEST(StringUtilTest, WprintfFormatPortabilityTest) {
+ static const struct {
+ const wchar_t* input;
+ bool portable;
+ } cases[] = {
+ { L"%ls", true },
+ { L"%s", false },
+ { L"%S", false },
+ { L"%lS", false },
+ { L"Hello, %s", false },
+ { L"%lc", true },
+ { L"%c", false },
+ { L"%C", false },
+ { L"%lC", false },
+ { L"%ls %s", false },
+ { L"%s %ls", false },
+ { L"%s %ls %s", false },
+ { L"%f", true },
+ { L"%f %F", false },
+ { L"%d %D", false },
+ { L"%o %O", false },
+ { L"%u %U", false },
+ { L"%f %d %o %u", true },
+ { L"%-8d (%02.1f%)", true },
+ { L"% 10s", false },
+ { L"% 10ls", true }
+ };
+ for (const auto& i : cases)
+ EXPECT_EQ(i.portable, IsWprintfFormatPortable(i.input));
+}
+
+TEST(StringUtilTest, RemoveChars) {
+ const char kRemoveChars[] = "-/+*";
+ std::string input = "A-+bc/d!*";
+ EXPECT_TRUE(RemoveChars(input, kRemoveChars, &input));
+ EXPECT_EQ("Abcd!", input);
+
+ // No characters match kRemoveChars.
+ EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input));
+ EXPECT_EQ("Abcd!", input);
+
+ // Empty string.
+ input.clear();
+ EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input));
+ EXPECT_EQ(std::string(), input);
+}
+
+TEST(StringUtilTest, ReplaceChars) {
+ struct TestData {
+ const char* input;
+ const char* replace_chars;
+ const char* replace_with;
+ const char* output;
+ bool result;
+ } cases[] = {
+ {"", "", "", "", false},
+ {"t", "t", "t", "t", true},
+ {"a", "b", "c", "a", false},
+ {"b", "b", "c", "c", true},
+ {"bob", "b", "p", "pop", true},
+ {"bob", "o", "i", "bib", true},
+ {"test", "", "", "test", false},
+ {"test", "", "!", "test", false},
+ {"test", "z", "!", "test", false},
+ {"test", "e", "!", "t!st", true},
+ {"test", "e", "!?", "t!?st", true},
+ {"test", "ez", "!", "t!st", true},
+ {"test", "zed", "!?", "t!?st", true},
+ {"test", "t", "!?", "!?es!?", true},
+ {"test", "et", "!>", "!>!>s!>", true},
+ {"test", "zest", "!", "!!!!", true},
+ {"test", "szt", "!", "!e!!", true},
+ {"test", "t", "test", "testestest", true},
+ {"tetst", "t", "test", "testeteststest", true},
+ {"ttttttt", "t", "-", "-------", true},
+ {"aAaAaAAaAAa", "A", "", "aaaaa", true},
+ {"xxxxxxxxxx", "x", "", "", true},
+ {"xxxxxxxxxx", "x", "x", "xxxxxxxxxx", true},
+ {"xxxxxxxxxx", "x", "y-", "y-y-y-y-y-y-y-y-y-y-", true},
+ {"xxxxxxxxxx", "x", "xy", "xyxyxyxyxyxyxyxyxyxy", true},
+ {"xxxxxxxxxx", "x", "zyx", "zyxzyxzyxzyxzyxzyxzyxzyxzyxzyx", true},
+ {"xaxxaxxxaxxxax", "x", "xy", "xyaxyxyaxyxyxyaxyxyxyaxy", true},
+ {"-xaxxaxxxaxxxax-", "x", "xy", "-xyaxyxyaxyxyxyaxyxyxyaxy-", true},
+ };
+
+ for (const TestData& scenario : cases) {
+ // Test with separate output and input vars.
+ std::string output;
+ bool result = ReplaceChars(scenario.input, scenario.replace_chars,
+ scenario.replace_with, &output);
+ EXPECT_EQ(scenario.result, result) << scenario.input;
+ EXPECT_EQ(scenario.output, output);
+ }
+
+ for (const TestData& scenario : cases) {
+ // Test with an input/output var of limited capacity.
+ std::string input_output = scenario.input;
+ input_output.shrink_to_fit();
+ bool result = ReplaceChars(input_output, scenario.replace_chars,
+ scenario.replace_with, &input_output);
+ EXPECT_EQ(scenario.result, result) << scenario.input;
+ EXPECT_EQ(scenario.output, input_output);
+ }
+
+ for (const TestData& scenario : cases) {
+ // Test with an input/output var of ample capacity; should
+ // not realloc.
+ std::string input_output = scenario.input;
+ input_output.reserve(strlen(scenario.output) * 2);
+ const void* original_buffer = input_output.data();
+ bool result = ReplaceChars(input_output, scenario.replace_chars,
+ scenario.replace_with, &input_output);
+ EXPECT_EQ(scenario.result, result) << scenario.input;
+ EXPECT_EQ(scenario.output, input_output);
+ EXPECT_EQ(original_buffer, input_output.data());
+ }
+}
+
+TEST(StringUtilTest, ContainsOnlyChars) {
+ // Providing an empty list of characters should return false but for the empty
+ // string.
+ EXPECT_TRUE(ContainsOnlyChars(std::string(), std::string()));
+ EXPECT_FALSE(ContainsOnlyChars("Hello", std::string()));
+
+ EXPECT_TRUE(ContainsOnlyChars(std::string(), "1234"));
+ EXPECT_TRUE(ContainsOnlyChars("1", "1234"));
+ EXPECT_TRUE(ContainsOnlyChars("1", "4321"));
+ EXPECT_TRUE(ContainsOnlyChars("123", "4321"));
+ EXPECT_FALSE(ContainsOnlyChars("123a", "4321"));
+
+ EXPECT_TRUE(ContainsOnlyChars(std::string(), kWhitespaceASCII));
+ EXPECT_TRUE(ContainsOnlyChars(" ", kWhitespaceASCII));
+ EXPECT_TRUE(ContainsOnlyChars("\t", kWhitespaceASCII));
+ EXPECT_TRUE(ContainsOnlyChars("\t \r \n ", kWhitespaceASCII));
+ EXPECT_FALSE(ContainsOnlyChars("a", kWhitespaceASCII));
+ EXPECT_FALSE(ContainsOnlyChars("\thello\r \n ", kWhitespaceASCII));
+
+ EXPECT_TRUE(ContainsOnlyChars(string16(), kWhitespaceUTF16));
+ EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16(" "), kWhitespaceUTF16));
+ EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16("\t"), kWhitespaceUTF16));
+ EXPECT_TRUE(ContainsOnlyChars(ASCIIToUTF16("\t \r \n "), kWhitespaceUTF16));
+ EXPECT_FALSE(ContainsOnlyChars(ASCIIToUTF16("a"), kWhitespaceUTF16));
+ EXPECT_FALSE(ContainsOnlyChars(ASCIIToUTF16("\thello\r \n "),
+ kWhitespaceUTF16));
+}
+
+TEST(StringUtilTest, CompareCaseInsensitiveASCII) {
+ EXPECT_EQ(0, CompareCaseInsensitiveASCII("", ""));
+ EXPECT_EQ(0, CompareCaseInsensitiveASCII("Asdf", "aSDf"));
+
+ // Differing lengths.
+ EXPECT_EQ(-1, CompareCaseInsensitiveASCII("Asdf", "aSDfA"));
+ EXPECT_EQ(1, CompareCaseInsensitiveASCII("AsdfA", "aSDf"));
+
+ // Differing values.
+ EXPECT_EQ(-1, CompareCaseInsensitiveASCII("AsdfA", "aSDfb"));
+ EXPECT_EQ(1, CompareCaseInsensitiveASCII("Asdfb", "aSDfA"));
+}
+
+TEST(StringUtilTest, EqualsCaseInsensitiveASCII) {
+ EXPECT_TRUE(EqualsCaseInsensitiveASCII("", ""));
+ EXPECT_TRUE(EqualsCaseInsensitiveASCII("Asdf", "aSDF"));
+ EXPECT_FALSE(EqualsCaseInsensitiveASCII("bsdf", "aSDF"));
+ EXPECT_FALSE(EqualsCaseInsensitiveASCII("Asdf", "aSDFz"));
+}
+
+TEST(StringUtilTest, IsUnicodeWhitespace) {
+ // NOT unicode white space.
+ EXPECT_FALSE(IsUnicodeWhitespace(L'\0'));
+ EXPECT_FALSE(IsUnicodeWhitespace(L'A'));
+ EXPECT_FALSE(IsUnicodeWhitespace(L'0'));
+ EXPECT_FALSE(IsUnicodeWhitespace(L'.'));
+ EXPECT_FALSE(IsUnicodeWhitespace(L';'));
+ EXPECT_FALSE(IsUnicodeWhitespace(L'\x4100'));
+
+ // Actual unicode whitespace.
+ EXPECT_TRUE(IsUnicodeWhitespace(L' '));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\xa0'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\x3000'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\t'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\r'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\v'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\f'));
+ EXPECT_TRUE(IsUnicodeWhitespace(L'\n'));
+}
+
+class WriteIntoTest : public testing::Test {
+ protected:
+ static void WritesCorrectly(size_t num_chars) {
+ std::string buffer;
+ char kOriginal[] = "supercali";
+ strncpy(WriteInto(&buffer, num_chars + 1), kOriginal, num_chars);
+ // Using std::string(buffer.c_str()) instead of |buffer| truncates the
+ // string at the first \0.
+ EXPECT_EQ(
+ std::string(kOriginal, std::min(num_chars, gurl_base::size(kOriginal) - 1)),
+ std::string(buffer.c_str()));
+ EXPECT_EQ(num_chars, buffer.size());
+ }
+};
+
+TEST_F(WriteIntoTest, WriteInto) {
+ // Validate that WriteInto reserves enough space and
+ // sizes a string correctly.
+ WritesCorrectly(1);
+ WritesCorrectly(2);
+ WritesCorrectly(5000);
+
+ // Validate that WriteInto doesn't modify other strings
+ // when using a Copy-on-Write implementation.
+ const char kLive[] = "live";
+ const char kDead[] = "dead";
+ const std::string live = kLive;
+ std::string dead = live;
+ strncpy(WriteInto(&dead, 5), kDead, 4);
+ EXPECT_EQ(kDead, dead);
+ EXPECT_EQ(4u, dead.size());
+ EXPECT_EQ(kLive, live);
+ EXPECT_EQ(4u, live.size());
+}
+
+} // namespace gurl_base
diff --git a/base/strings/string_util_win.h b/base/strings/string_util_win.h
new file mode 100644
index 0000000..710d574
--- /dev/null
+++ b/base/strings/string_util_win.h
@@ -0,0 +1,44 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRING_UTIL_WIN_H_
+#define BASE_STRINGS_STRING_UTIL_WIN_H_
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+
+#include "polyfills/base/logging.h"
+
+namespace gurl_base {
+
+// Chromium code style is to not use malloc'd strings; this is only for use
+// for interaction with APIs that require it.
+inline char* strdup(const char* str) {
+ return _strdup(str);
+}
+
+inline int vsnprintf(char* buffer, size_t size,
+ const char* format, va_list arguments) {
+ int length = vsnprintf_s(buffer, size, size - 1, format, arguments);
+ if (length < 0)
+ return _vscprintf(format, arguments);
+ return length;
+}
+
+inline int vswprintf(wchar_t* buffer, size_t size,
+ const wchar_t* format, va_list arguments) {
+ GURL_DCHECK(IsWprintfFormatPortable(format));
+
+ int length = _vsnwprintf_s(buffer, size, size - 1, format, arguments);
+ if (length < 0)
+ return _vscwprintf(format, arguments);
+ return length;
+}
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRING_UTIL_WIN_H_
diff --git a/base/strings/stringize_macros.h b/base/strings/stringize_macros.h
new file mode 100644
index 0000000..d4e2707
--- /dev/null
+++ b/base/strings/stringize_macros.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This file defines preprocessor macros for stringizing preprocessor
+// symbols (or their output) and manipulating preprocessor symbols
+// that define strings.
+
+#ifndef BASE_STRINGS_STRINGIZE_MACROS_H_
+#define BASE_STRINGS_STRINGIZE_MACROS_H_
+
+#include "build/build_config.h"
+
+// This is not very useful as it does not expand defined symbols if
+// called directly. Use its counterpart without the _NO_EXPANSION
+// suffix, below.
+#define STRINGIZE_NO_EXPANSION(x) #x
+
+// Use this to quote the provided parameter, first expanding it if it
+// is a preprocessor symbol.
+//
+// For example, if:
+// #define A FOO
+// #define B(x) myobj->FunctionCall(x)
+//
+// Then:
+// STRINGIZE(A) produces "FOO"
+// STRINGIZE(B(y)) produces "myobj->FunctionCall(y)"
+#define STRINGIZE(x) STRINGIZE_NO_EXPANSION(x)
+
+#endif // BASE_STRINGS_STRINGIZE_MACROS_H_
diff --git a/base/strings/stringize_macros_unittest.cc b/base/strings/stringize_macros_unittest.cc
new file mode 100644
index 0000000..d7f9e56
--- /dev/null
+++ b/base/strings/stringize_macros_unittest.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/stringize_macros.h"
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Macros as per documentation in header file.
+#define PREPROCESSOR_UTIL_UNITTEST_A FOO
+#define PREPROCESSOR_UTIL_UNITTEST_B(x) myobj->FunctionCall(x)
+#define PREPROCESSOR_UTIL_UNITTEST_C "foo"
+
+TEST(StringizeTest, Ansi) {
+ EXPECT_STREQ(
+ "PREPROCESSOR_UTIL_UNITTEST_A",
+ STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_A));
+ EXPECT_STREQ(
+ "PREPROCESSOR_UTIL_UNITTEST_B(y)",
+ STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_B(y)));
+ EXPECT_STREQ(
+ "PREPROCESSOR_UTIL_UNITTEST_C",
+ STRINGIZE_NO_EXPANSION(PREPROCESSOR_UTIL_UNITTEST_C));
+
+ EXPECT_STREQ("FOO", STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_A));
+ EXPECT_STREQ("myobj->FunctionCall(y)",
+ STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_B(y)));
+ EXPECT_STREQ("\"foo\"", STRINGIZE(PREPROCESSOR_UTIL_UNITTEST_C));
+}
diff --git a/base/strings/stringprintf.cc b/base/strings/stringprintf.cc
new file mode 100644
index 0000000..1a08ffb
--- /dev/null
+++ b/base/strings/stringprintf.cc
@@ -0,0 +1,187 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/stringprintf.h"
+
+#include <errno.h>
+#include <stddef.h>
+
+#include <vector>
+
+#include "base/scoped_clear_last_error.h"
+#include "base/stl_util.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+namespace {
+
+// Overloaded wrappers around vsnprintf and vswprintf. The buf_size parameter
+// is the size of the buffer. These return the number of characters in the
+// formatted string excluding the NUL terminator. If the buffer is not
+// large enough to accommodate the formatted string without truncation, they
+// return the number of characters that would be in the fully-formatted string
+// (vsnprintf, and vswprintf on Windows), or -1 (vswprintf on POSIX platforms).
+inline int vsnprintfT(char* buffer,
+ size_t buf_size,
+ const char* format,
+ va_list argptr) {
+ return gurl_base::vsnprintf(buffer, buf_size, format, argptr);
+}
+
+#if defined(OS_WIN)
+inline int vsnprintfT(wchar_t* buffer,
+ size_t buf_size,
+ const wchar_t* format,
+ va_list argptr) {
+ return gurl_base::vswprintf(buffer, buf_size, format, argptr);
+}
+#endif
+
+// Templatized backend for StringPrintF/StringAppendF. This does not finalize
+// the va_list, the caller is expected to do that.
+template <class StringType>
+static void StringAppendVT(StringType* dst,
+ const typename StringType::value_type* format,
+ va_list ap) {
+ // First try with a small fixed size buffer.
+ // This buffer size should be kept in sync with StringPrintfTest.GrowBoundary
+ // and StringPrintfTest.StringPrintfBounds.
+ typename StringType::value_type stack_buf[1024];
+
+ va_list ap_copy;
+ va_copy(ap_copy, ap);
+
+ gurl_base::internal::ScopedClearLastError last_error;
+ int result = vsnprintfT(stack_buf, gurl_base::size(stack_buf), format, ap_copy);
+ va_end(ap_copy);
+
+ if (result >= 0 && result < static_cast<int>(gurl_base::size(stack_buf))) {
+ // It fit.
+ dst->append(stack_buf, result);
+ return;
+ }
+
+ // Repeatedly increase buffer size until it fits.
+ int mem_length = gurl_base::size(stack_buf);
+ while (true) {
+ if (result < 0) {
+#if defined(OS_WIN)
+ // On Windows, vsnprintfT always returns the number of characters in a
+ // fully-formatted string, so if we reach this point, something else is
+ // wrong and no amount of buffer-doubling is going to fix it.
+ return;
+#else
+ if (errno != 0 && errno != EOVERFLOW)
+ return;
+ // Try doubling the buffer size.
+ mem_length *= 2;
+#endif
+ } else {
+ // We need exactly "result + 1" characters.
+ mem_length = result + 1;
+ }
+
+ if (mem_length > 32 * 1024 * 1024) {
+ // That should be plenty, don't try anything larger. This protects
+ // against huge allocations when using vsnprintfT implementations that
+ // return -1 for reasons other than overflow without setting errno.
+ GURL_DLOG(WARNING) << "Unable to printf the requested string due to size.";
+ return;
+ }
+
+ std::vector<typename StringType::value_type> mem_buf(mem_length);
+
+ // NOTE: You can only use a va_list once. Since we're in a while loop, we
+ // need to make a new copy each time so we don't use up the original.
+ va_copy(ap_copy, ap);
+ result = vsnprintfT(&mem_buf[0], mem_length, format, ap_copy);
+ va_end(ap_copy);
+
+ if ((result >= 0) && (result < mem_length)) {
+ // It fit.
+ dst->append(&mem_buf[0], result);
+ return;
+ }
+ }
+}
+
+} // namespace
+
+std::string StringPrintf(const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ std::string result;
+ StringAppendV(&result, format, ap);
+ va_end(ap);
+ return result;
+}
+
+#if defined(OS_WIN)
+std::wstring StringPrintf(const wchar_t* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ std::wstring result;
+ StringAppendV(&result, format, ap);
+ va_end(ap);
+ return result;
+}
+#endif
+
+std::string StringPrintV(const char* format, va_list ap) {
+ std::string result;
+ StringAppendV(&result, format, ap);
+ return result;
+}
+
+const std::string& SStringPrintf(std::string* dst, const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ dst->clear();
+ StringAppendV(dst, format, ap);
+ va_end(ap);
+ return *dst;
+}
+
+#if defined(OS_WIN)
+const std::wstring& SStringPrintf(std::wstring* dst,
+ const wchar_t* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ dst->clear();
+ StringAppendV(dst, format, ap);
+ va_end(ap);
+ return *dst;
+}
+#endif
+
+void StringAppendF(std::string* dst, const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ StringAppendV(dst, format, ap);
+ va_end(ap);
+}
+
+#if defined(OS_WIN)
+void StringAppendF(std::wstring* dst, const wchar_t* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ StringAppendV(dst, format, ap);
+ va_end(ap);
+}
+#endif
+
+void StringAppendV(std::string* dst, const char* format, va_list ap) {
+ StringAppendVT(dst, format, ap);
+}
+
+#if defined(OS_WIN)
+void StringAppendV(std::wstring* dst, const wchar_t* format, va_list ap) {
+ StringAppendVT(dst, format, ap);
+}
+#endif
+
+} // namespace gurl_base
diff --git a/base/strings/stringprintf.h b/base/strings/stringprintf.h
new file mode 100644
index 0000000..2abdb68
--- /dev/null
+++ b/base/strings/stringprintf.h
@@ -0,0 +1,60 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_STRINGPRINTF_H_
+#define BASE_STRINGS_STRINGPRINTF_H_
+
+#include <stdarg.h> // va_list
+
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "base/compiler_specific.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+// Return a C++ string given printf-like input.
+BASE_EXPORT std::string StringPrintf(const char* format, ...)
+ PRINTF_FORMAT(1, 2) WARN_UNUSED_RESULT;
+#if defined(OS_WIN)
+BASE_EXPORT std::wstring StringPrintf(const wchar_t* format, ...)
+ WPRINTF_FORMAT(1, 2) WARN_UNUSED_RESULT;
+#endif
+
+// Return a C++ string given vprintf-like input.
+BASE_EXPORT std::string StringPrintV(const char* format, va_list ap)
+ PRINTF_FORMAT(1, 0) WARN_UNUSED_RESULT;
+
+// Store result into a supplied string and return it.
+BASE_EXPORT const std::string& SStringPrintf(std::string* dst,
+ const char* format,
+ ...) PRINTF_FORMAT(2, 3);
+#if defined(OS_WIN)
+BASE_EXPORT const std::wstring& SStringPrintf(std::wstring* dst,
+ const wchar_t* format,
+ ...) WPRINTF_FORMAT(2, 3);
+#endif
+
+// Append result to a supplied string.
+BASE_EXPORT void StringAppendF(std::string* dst, const char* format, ...)
+ PRINTF_FORMAT(2, 3);
+#if defined(OS_WIN)
+BASE_EXPORT void StringAppendF(std::wstring* dst, const wchar_t* format, ...)
+ WPRINTF_FORMAT(2, 3);
+#endif
+
+// Lower-level routine that takes a va_list and appends to a specified
+// string. All other routines are just convenience wrappers around it.
+BASE_EXPORT void StringAppendV(std::string* dst, const char* format, va_list ap)
+ PRINTF_FORMAT(2, 0);
+#if defined(OS_WIN)
+BASE_EXPORT void StringAppendV(std::wstring* dst,
+ const wchar_t* format,
+ va_list ap) WPRINTF_FORMAT(2, 0);
+#endif
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_STRINGPRINTF_H_
diff --git a/base/strings/stringprintf_unittest.cc b/base/strings/stringprintf_unittest.cc
new file mode 100644
index 0000000..59e3403
--- /dev/null
+++ b/base/strings/stringprintf_unittest.cc
@@ -0,0 +1,182 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/stringprintf.h"
+
+#include <errno.h>
+#include <stddef.h>
+
+#include "base/macros.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+namespace {
+
+// A helper for the StringAppendV test that follows.
+//
+// Just forwards its args to StringAppendV.
+static void StringAppendVTestHelper(std::string* out, const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ StringAppendV(out, format, ap);
+ va_end(ap);
+}
+
+} // namespace
+
+TEST(StringPrintfTest, StringPrintfEmpty) {
+ EXPECT_EQ("", StringPrintf("%s", ""));
+}
+
+TEST(StringPrintfTest, StringPrintfMisc) {
+ EXPECT_EQ("123hello w", StringPrintf("%3d%2s %1c", 123, "hello", 'w'));
+#if defined(OS_WIN)
+ EXPECT_EQ(L"123hello w", StringPrintf(L"%3d%2ls %1lc", 123, L"hello", 'w'));
+#endif
+}
+
+TEST(StringPrintfTest, StringAppendfEmptyString) {
+ std::string value("Hello");
+ StringAppendF(&value, "%s", "");
+ EXPECT_EQ("Hello", value);
+
+#if defined(OS_WIN)
+ std::wstring valuew(L"Hello");
+ StringAppendF(&valuew, L"%ls", L"");
+ EXPECT_EQ(L"Hello", valuew);
+#endif
+}
+
+TEST(StringPrintfTest, StringAppendfString) {
+ std::string value("Hello");
+ StringAppendF(&value, " %s", "World");
+ EXPECT_EQ("Hello World", value);
+
+#if defined(OS_WIN)
+ std::wstring valuew(L"Hello");
+ StringAppendF(&valuew, L" %ls", L"World");
+ EXPECT_EQ(L"Hello World", valuew);
+#endif
+}
+
+TEST(StringPrintfTest, StringAppendfInt) {
+ std::string value("Hello");
+ StringAppendF(&value, " %d", 123);
+ EXPECT_EQ("Hello 123", value);
+
+#if defined(OS_WIN)
+ std::wstring valuew(L"Hello");
+ StringAppendF(&valuew, L" %d", 123);
+ EXPECT_EQ(L"Hello 123", valuew);
+#endif
+}
+
+// Make sure that lengths exactly around the initial buffer size are handled
+// correctly.
+TEST(StringPrintfTest, StringPrintfBounds) {
+ const int kSrcLen = 1026;
+ char src[kSrcLen];
+ for (auto& i : src)
+ i = 'A';
+
+ wchar_t srcw[kSrcLen];
+ for (auto& i : srcw)
+ i = 'A';
+
+ for (int i = 1; i < 3; i++) {
+ src[kSrcLen - i] = 0;
+ std::string out;
+ SStringPrintf(&out, "%s", src);
+ EXPECT_STREQ(src, out.c_str());
+
+#if defined(OS_WIN)
+ srcw[kSrcLen - i] = 0;
+ std::wstring outw;
+ SStringPrintf(&outw, L"%ls", srcw);
+ EXPECT_STREQ(srcw, outw.c_str());
+#endif
+ }
+}
+
+// Test very large sprintfs that will cause the buffer to grow.
+TEST(StringPrintfTest, Grow) {
+  char src[1026];
+  for (auto& i : src)
+    i = 'A';
+  src[1025] = 0;
+
+  const char fmt[] = "%sB%sB%sB%sB%sB%sB%s";
+
+  std::string out;
+  SStringPrintf(&out, fmt, src, src, src, src, src, src, src);
+
+  // Reference buffer: a zero-filled std::string frees itself (no delete[]).
+  const int kRefSize = 320000;
+  std::string ref(kRefSize, '\0');
+#if defined(OS_WIN)
+  sprintf_s(&ref[0], kRefSize, fmt, src, src, src, src, src, src, src);
+#elif defined(OS_POSIX) || defined(OS_FUCHSIA)
+  snprintf(&ref[0], kRefSize, fmt, src, src, src, src, src, src, src);
+#endif
+
+  EXPECT_STREQ(ref.c_str(), out.c_str());
+}
+
+TEST(StringPrintfTest, StringAppendV) {
+ std::string out;
+ StringAppendVTestHelper(&out, "%d foo %s", 1, "bar");
+ EXPECT_EQ("1 foo bar", out);
+}
+
+// Test the boundary condition for the size of StringAppendVT's internal
+// stack buffer.
+TEST(StringPrintfTest, GrowBoundary) {
+ const int kStringUtilBufLen = 1024;
+ // Our buffer should be one larger than the size of StringAppendVT's stack
+ // buffer.
+ // And need extra one for NULL-terminator.
+ const int kBufLen = kStringUtilBufLen + 1 + 1;
+ char src[kBufLen];
+ for (int i = 0; i < kBufLen - 1; ++i)
+ src[i] = 'a';
+ src[kBufLen - 1] = 0;
+
+ std::string out;
+ SStringPrintf(&out, "%s", src);
+
+ EXPECT_STREQ(src, out.c_str());
+}
+
+#if defined(OS_WIN)
+// vswprintf in Visual Studio 2013 fails when given U+FFFF. This tests that the
+// failure case is gracefully handled. In Visual Studio 2015 the bad character
+// is passed through.
+TEST(StringPrintfTest, Invalid) {
+ wchar_t invalid[2];
+ invalid[0] = 0xffff;
+ invalid[1] = 0;
+
+ std::wstring out;
+ SStringPrintf(&out, L"%ls", invalid);
+#if _MSC_VER >= 1900
+ EXPECT_STREQ(invalid, out.c_str());
+#else
+ EXPECT_STREQ(L"", out.c_str());
+#endif
+}
+#endif
+
+// Test that StringPrintf and StringAppendV do not change errno.
+TEST(StringPrintfTest, StringPrintfErrno) {
+ errno = 1;
+ EXPECT_EQ("", StringPrintf("%s", ""));
+ EXPECT_EQ(1, errno);
+ std::string out;
+ StringAppendVTestHelper(&out, "%d foo %s", 1, "bar");
+ EXPECT_EQ(1, errno);
+}
+
+} // namespace gurl_base
diff --git a/base/strings/sys_string_conversions.h b/base/strings/sys_string_conversions.h
new file mode 100644
index 0000000..08082ae
--- /dev/null
+++ b/base/strings/sys_string_conversions.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_SYS_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_SYS_STRING_CONVERSIONS_H_
+
+// Provides system-dependent string type conversions for cases where it's
+// necessary to not use ICU. Generally, you should not need this in Chrome,
+// but it is used in some shared code. Dependencies should be minimal.
+
+#include <stdint.h>
+
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "build/build_config.h"
+
+#if defined(OS_MACOSX)
+#include <CoreFoundation/CoreFoundation.h>
+#ifdef __OBJC__
+@class NSString;
+#else
+class NSString;
+#endif
+#endif // OS_MACOSX
+
+namespace gurl_base {
+
+// Converts between wide and UTF-8 representations of a string. On error, the
+// result is system-dependent.
+BASE_EXPORT std::string SysWideToUTF8(const std::wstring& wide);
+BASE_EXPORT std::wstring SysUTF8ToWide(StringPiece utf8);
+
+// Converts between wide and the system multi-byte representations of a string.
+// DANGER: This will lose information and can change (on Windows, this can
+// change between reboots).
+BASE_EXPORT std::string SysWideToNativeMB(const std::wstring& wide);
+BASE_EXPORT std::wstring SysNativeMBToWide(StringPiece native_mb);
+
+// Windows-specific ------------------------------------------------------------
+
+#if defined(OS_WIN)
+
+// Converts between 8-bit and wide strings, using the given code page. The
+// code page identifier is one accepted by the Windows function
+// MultiByteToWideChar().
+BASE_EXPORT std::wstring SysMultiByteToWide(StringPiece mb, uint32_t code_page);
+BASE_EXPORT std::string SysWideToMultiByte(const std::wstring& wide,
+ uint32_t code_page);
+
+#endif // defined(OS_WIN)
+
+// Mac-specific ----------------------------------------------------------------
+
+#if defined(OS_MACOSX)
+
+// Converts between STL strings and CFStringRefs/NSStrings.
+
+// Creates a string, and returns it with a refcount of 1. You are responsible
+// for releasing it. Returns NULL on failure.
+BASE_EXPORT CFStringRef SysUTF8ToCFStringRef(StringPiece utf8);
+BASE_EXPORT CFStringRef SysUTF16ToCFStringRef(StringPiece16 utf16);
+
+// Same, but returns an autoreleased NSString.
+BASE_EXPORT NSString* SysUTF8ToNSString(StringPiece utf8);
+BASE_EXPORT NSString* SysUTF16ToNSString(StringPiece16 utf16);
+
+// Converts a CFStringRef to an STL string. Returns an empty string on failure.
+BASE_EXPORT std::string SysCFStringRefToUTF8(CFStringRef ref);
+BASE_EXPORT string16 SysCFStringRefToUTF16(CFStringRef ref);
+
+// Same, but accepts NSString input. Converts nil NSString* to the appropriate
+// string type of length 0.
+BASE_EXPORT std::string SysNSStringToUTF8(NSString* ref);
+BASE_EXPORT string16 SysNSStringToUTF16(NSString* ref);
+
+#endif // defined(OS_MACOSX)
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_SYS_STRING_CONVERSIONS_H_
diff --git a/base/strings/sys_string_conversions_posix.cc b/base/strings/sys_string_conversions_posix.cc
new file mode 100644
index 0000000..80f01e6
--- /dev/null
+++ b/base/strings/sys_string_conversions_posix.cc
@@ -0,0 +1,162 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/sys_string_conversions.h"
+
+#include <stddef.h>
+#include <wchar.h>
+
+#include "base/strings/string_piece.h"
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+std::string SysWideToUTF8(const std::wstring& wide) {
+ // In theory this should be using the system-provided conversion rather
+ // than our ICU, but this will do for now.
+ return WideToUTF8(wide);
+}
+std::wstring SysUTF8ToWide(StringPiece utf8) {
+ // In theory this should be using the system-provided conversion rather
+ // than our ICU, but this will do for now.
+ std::wstring out;
+ UTF8ToWide(utf8.data(), utf8.size(), &out);
+ return out;
+}
+
+#if defined(SYSTEM_NATIVE_UTF8) || defined(OS_ANDROID)
+// TODO(port): Consider reverting the OS_ANDROID when we have wcrtomb()
+// support and a better understanding of what calls these routines.
+
+std::string SysWideToNativeMB(const std::wstring& wide) {
+ return WideToUTF8(wide);
+}
+
+std::wstring SysNativeMBToWide(StringPiece native_mb) {
+ return SysUTF8ToWide(native_mb);
+}
+
+#else
+
+std::string SysWideToNativeMB(const std::wstring& wide) {
+  // Converts |wide| to the current locale's multi-byte encoding with wcrtomb.
+  // Returns an empty string if any character cannot be converted.
+  // First pass: count the output bytes. We walk through the string without
+  // writing the output, summing the bytes wcrtomb produces per character.
+  size_t num_out_chars = 0;
+  mbstate_t ps = mbstate_t();  // Value-initialized: the initial shift state.
+  for (auto src : wide) {
+    // Use a temp buffer since calling wcrtomb with an output of NULL does not
+    // calculate the output length.
+    char buf[16];
+    // Skip NULLs to avoid wcrtomb's special handling of them.
+    size_t res = src ? wcrtomb(buf, src, &ps) : 0;
+    switch (res) {
+      // Handle any errors and return an empty string.
+      case static_cast<size_t>(-1):
+        return std::string();
+        break;
+      case 0:
+        // We hit an embedded null byte, keep going.
+        ++num_out_chars;
+        break;
+      default:
+        num_out_chars += res;
+        break;
+    }
+  }
+
+  if (num_out_chars == 0)
+    return std::string();
+
+  std::string out;
+  out.resize(num_out_chars);
+
+  // We walk the input string again, with |i| tracking the index of the
+  // wide input, and |j| tracking the multi-byte output.
+  ps = mbstate_t();  // Back to the initial shift state for the second pass.
+  for (size_t i = 0, j = 0; i < wide.size(); ++i) {
+    const wchar_t src = wide[i];
+    // We don't want wcrtomb to do its funkiness for embedded NULLs.
+    size_t res = src ? wcrtomb(&out[j], src, &ps) : 0;
+    switch (res) {
+      // Handle any errors and return an empty string.
+      case static_cast<size_t>(-1):
+        return std::string();
+        break;
+      case 0:
+        // We hit an embedded null byte, keep going.
+        ++j;  // Output is already zeroed.
+        break;
+      default:
+        j += res;
+        break;
+    }
+  }
+
+  return out;
+}
+
+std::wstring SysNativeMBToWide(StringPiece native_mb) {
+  // Converts |native_mb| from the locale's multi-byte encoding using mbrtowc.
+  // Returns an empty string on invalid (-1) or incomplete (-2) input.
+  // First pass: count the wide characters. We walk through the string
+  // without writing the output, counting the number of wide characters.
+  size_t num_out_chars = 0;
+  mbstate_t ps = mbstate_t();  // Value-initialized: the initial shift state.
+  for (size_t i = 0; i < native_mb.size(); ) {
+    const char* src = native_mb.data() + i;
+    size_t res = mbrtowc(nullptr, src, native_mb.size() - i, &ps);
+    switch (res) {
+      // Handle any errors and return an empty string.
+      case static_cast<size_t>(-2):
+      case static_cast<size_t>(-1):
+        return std::wstring();
+        break;
+      case 0:
+        // We hit an embedded null byte, keep going.
+        i += 1;
+        FALLTHROUGH;
+      default:
+        i += res;
+        ++num_out_chars;
+        break;
+    }
+  }
+
+  if (num_out_chars == 0)
+    return std::wstring();
+
+  std::wstring out;
+  out.resize(num_out_chars);
+
+  ps = mbstate_t();  // Clear the shift state.
+  // We walk the input string again, with |i| tracking the index of the
+  // multi-byte input, and |j| tracking the wide output.
+  for (size_t i = 0, j = 0; i < native_mb.size(); ++j) {
+    const char* src = native_mb.data() + i;
+    wchar_t* dst = &out[j];
+    size_t res = mbrtowc(dst, src, native_mb.size() - i, &ps);
+    switch (res) {
+      // Handle any errors and return an empty string.
+      case static_cast<size_t>(-2):
+      case static_cast<size_t>(-1):
+        return std::wstring();
+        break;
+      case 0:
+        i += 1;  // Skip null byte.
+        break;
+      default:
+        i += res;
+        break;
+    }
+  }
+
+  return out;
+}
+
+#endif // defined(SYSTEM_NATIVE_UTF8) || defined(OS_ANDROID)
+
+} // namespace gurl_base
diff --git a/base/strings/sys_string_conversions_unittest.cc b/base/strings/sys_string_conversions_unittest.cc
new file mode 100644
index 0000000..0e78d43
--- /dev/null
+++ b/base/strings/sys_string_conversions_unittest.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <string>
+
+#include "base/macros.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/sys_string_conversions.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/test/scoped_locale.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+#ifdef WCHAR_T_IS_UTF32
+static const std::wstring kSysWideOldItalicLetterA = L"\x10300";
+#else
+static const std::wstring kSysWideOldItalicLetterA = L"\xd800\xdf00";
+#endif
+
+namespace gurl_base {
+
+TEST(SysStrings, SysWideToUTF8) {
+ EXPECT_EQ("Hello, world", SysWideToUTF8(L"Hello, world"));
+ EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToUTF8(L"\x4f60\x597d"));
+
+ // >16 bits
+ EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToUTF8(kSysWideOldItalicLetterA));
+
+ // Error case. When Windows finds a UTF-16 character going off the end of
+ // a string, it just converts that literal value to UTF-8, even though this
+ // is invalid.
+ //
+ // This is what XP does, but Vista has different behavior, so we don't bother
+ // verifying it:
+ // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
+ // SysWideToUTF8(L"\x4f60\xd800zyxw"));
+
+ // Test embedded NULLs.
+ std::wstring wide_null(L"a");
+ wide_null.push_back(0);
+ wide_null.push_back('b');
+
+ std::string expected_null("a");
+ expected_null.push_back(0);
+ expected_null.push_back('b');
+
+ EXPECT_EQ(expected_null, SysWideToUTF8(wide_null));
+}
+
+TEST(SysStrings, SysUTF8ToWide) {
+ EXPECT_EQ(L"Hello, world", SysUTF8ToWide("Hello, world"));
+ EXPECT_EQ(L"\x4f60\x597d", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
+ // >16 bits
+ EXPECT_EQ(kSysWideOldItalicLetterA, SysUTF8ToWide("\xF0\x90\x8C\x80"));
+
+ // Error case. When Windows finds an invalid UTF-8 character, it just skips
+ // it. This seems weird because it's inconsistent with the reverse conversion.
+ //
+ // This is what XP does, but Vista has different behavior, so we don't bother
+ // verifying it:
+ // EXPECT_EQ(L"\x4f60zyxw", SysUTF8ToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
+
+ // Test embedded NULLs.
+ std::string utf8_null("a");
+ utf8_null.push_back(0);
+ utf8_null.push_back('b');
+
+ std::wstring expected_null(L"a");
+ expected_null.push_back(0);
+ expected_null.push_back('b');
+
+ EXPECT_EQ(expected_null, SysUTF8ToWide(utf8_null));
+}
+
+#if defined(OS_LINUX) // Tests depend on setting a specific Linux locale.
+
+TEST(SysStrings, SysWideToNativeMB) {
+#if !defined(SYSTEM_NATIVE_UTF8)
+ ScopedLocale locale("en_US.UTF-8");
+#endif
+ EXPECT_EQ("Hello, world", SysWideToNativeMB(L"Hello, world"));
+ EXPECT_EQ("\xe4\xbd\xa0\xe5\xa5\xbd", SysWideToNativeMB(L"\x4f60\x597d"));
+
+ // >16 bits
+ EXPECT_EQ("\xF0\x90\x8C\x80", SysWideToNativeMB(kSysWideOldItalicLetterA));
+
+ // Error case. When Windows finds a UTF-16 character going off the end of
+ // a string, it just converts that literal value to UTF-8, even though this
+ // is invalid.
+ //
+ // This is what XP does, but Vista has different behavior, so we don't bother
+ // verifying it:
+ // EXPECT_EQ("\xE4\xBD\xA0\xED\xA0\x80zyxw",
+ // SysWideToNativeMB(L"\x4f60\xd800zyxw"));
+
+ // Test embedded NULLs.
+ std::wstring wide_null(L"a");
+ wide_null.push_back(0);
+ wide_null.push_back('b');
+
+ std::string expected_null("a");
+ expected_null.push_back(0);
+ expected_null.push_back('b');
+
+ EXPECT_EQ(expected_null, SysWideToNativeMB(wide_null));
+}
+
+// We assume the test is running in a UTF8 locale.
+TEST(SysStrings, SysNativeMBToWide) {
+#if !defined(SYSTEM_NATIVE_UTF8)
+ ScopedLocale locale("en_US.UTF-8");
+#endif
+ EXPECT_EQ(L"Hello, world", SysNativeMBToWide("Hello, world"));
+ EXPECT_EQ(L"\x4f60\x597d", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5\xbd"));
+ // >16 bits
+ EXPECT_EQ(kSysWideOldItalicLetterA, SysNativeMBToWide("\xF0\x90\x8C\x80"));
+
+ // Error case. When Windows finds an invalid UTF-8 character, it just skips
+ // it. This seems weird because it's inconsistent with the reverse conversion.
+ //
+ // This is what XP does, but Vista has different behavior, so we don't bother
+ // verifying it:
+ // EXPECT_EQ(L"\x4f60zyxw", SysNativeMBToWide("\xe4\xbd\xa0\xe5\xa5zyxw"));
+
+ // Test embedded NULLs.
+ std::string utf8_null("a");
+ utf8_null.push_back(0);
+ utf8_null.push_back('b');
+
+ std::wstring expected_null(L"a");
+ expected_null.push_back(0);
+ expected_null.push_back('b');
+
+ EXPECT_EQ(expected_null, SysNativeMBToWide(utf8_null));
+}
+
+static const wchar_t* const kConvertRoundtripCases[] = {
+ L"Google Video",
+ // "网页 图片 资讯更多 »"
+ L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
+ // "Παγκόσμιος Ιστός"
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
+ // "Поиск страниц на русском"
+ L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
+ L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
+ L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
+ // "전체서비스"
+ L"\xc804\xccb4\xc11c\xbe44\xc2a4",
+
+ // Test characters that take more than 16 bits. This will depend on whether
+ // wchar_t is 16 or 32 bits.
+#if defined(WCHAR_T_IS_UTF16)
+ L"\xd800\xdf00",
+ // Five supplementary-plane code points, U+11D40 - U+11D44 (surrogate pairs).
+ L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
+#elif defined(WCHAR_T_IS_UTF32)
+ L"\x10300",
+ // Five supplementary-plane code points, U+11D40 - U+11D44.
+ L"\x11d40\x11d41\x11d42\x11d43\x11d44",
+#endif
+};
+
+
+TEST(SysStrings, SysNativeMBAndWide) {
+#if !defined(SYSTEM_NATIVE_UTF8)
+ ScopedLocale locale("en_US.UTF-8");
+#endif
+ for (auto* i : kConvertRoundtripCases) {
+ std::wstring wide = i;
+ std::wstring trip = SysNativeMBToWide(SysWideToNativeMB(wide));
+ EXPECT_EQ(wide.size(), trip.size());
+ EXPECT_EQ(wide, trip);
+ }
+
+ // We assume our test is running in UTF-8, so double check through ICU.
+ for (auto* i : kConvertRoundtripCases) {
+ std::wstring wide = i;
+ std::wstring trip = SysNativeMBToWide(WideToUTF8(wide));
+ EXPECT_EQ(wide.size(), trip.size());
+ EXPECT_EQ(wide, trip);
+ }
+
+ for (auto* i : kConvertRoundtripCases) {
+ std::wstring wide = i;
+ std::wstring trip = UTF8ToWide(SysWideToNativeMB(wide));
+ EXPECT_EQ(wide.size(), trip.size());
+ EXPECT_EQ(wide, trip);
+ }
+}
+#endif // OS_LINUX
+
+} // namespace gurl_base
diff --git a/base/strings/sys_string_conversions_win.cc b/base/strings/sys_string_conversions_win.cc
new file mode 100644
index 0000000..3f08956
--- /dev/null
+++ b/base/strings/sys_string_conversions_win.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/sys_string_conversions.h"
+
+#include <windows.h>
+#include <stdint.h>
+
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// Do not assert in this function since it is used by the assertion code!
+std::string SysWideToUTF8(const std::wstring& wide) {
+ return SysWideToMultiByte(wide, CP_UTF8);
+}
+
+// Do not assert in this function since it is used by the assertion code!
+std::wstring SysUTF8ToWide(StringPiece utf8) {
+ return SysMultiByteToWide(utf8, CP_UTF8);
+}
+
+std::string SysWideToNativeMB(const std::wstring& wide) {
+ return SysWideToMultiByte(wide, CP_ACP);
+}
+
+std::wstring SysNativeMBToWide(StringPiece native_mb) {
+ return SysMultiByteToWide(native_mb, CP_ACP);
+}
+
+// Do not assert in this function since it is used by the assertion code!
+std::wstring SysMultiByteToWide(StringPiece mb, uint32_t code_page) {
+ if (mb.empty())
+ return std::wstring();
+
+ int mb_length = static_cast<int>(mb.length());
+ // Compute the length of the buffer.
+ int charcount = MultiByteToWideChar(code_page, 0,
+ mb.data(), mb_length, NULL, 0);
+ if (charcount == 0)
+ return std::wstring();
+
+ std::wstring wide;
+ wide.resize(charcount);
+ MultiByteToWideChar(code_page, 0, mb.data(), mb_length, &wide[0], charcount);
+
+ return wide;
+}
+
+// Do not assert in this function since it is used by the assertion code!
+std::string SysWideToMultiByte(const std::wstring& wide, uint32_t code_page) {
+ int wide_length = static_cast<int>(wide.length());
+ if (wide_length == 0)
+ return std::string();
+
+ // Compute the length of the buffer we'll need.
+ int charcount = WideCharToMultiByte(code_page, 0, wide.data(), wide_length,
+ NULL, 0, NULL, NULL);
+ if (charcount == 0)
+ return std::string();
+
+ std::string mb;
+ mb.resize(charcount);
+ WideCharToMultiByte(code_page, 0, wide.data(), wide_length,
+ &mb[0], charcount, NULL, NULL);
+
+ return mb;
+}
+
+} // namespace gurl_base
diff --git a/base/strings/utf_offset_string_conversions.cc b/base/strings/utf_offset_string_conversions.cc
new file mode 100644
index 0000000..5bf7967
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.cc
@@ -0,0 +1,263 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_offset_string_conversions.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "polyfills/base/logging.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/utf_string_conversion_utils.h"
+
+namespace gurl_base {
+
+OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
+ size_t original_length,
+ size_t output_length)
+ : original_offset(original_offset),
+ original_length(original_length),
+ output_length(output_length) {
+}
+
+// static
+void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
+ std::vector<size_t>* offsets_for_adjustment,
+ size_t limit) {
+ GURL_DCHECK(offsets_for_adjustment);
+ for (auto& i : *offsets_for_adjustment)
+ AdjustOffset(adjustments, &i, limit);
+}
+
+// static
+void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
+ size_t* offset,
+ size_t limit) {
+ GURL_DCHECK(offset);
+ if (*offset == string16::npos)
+ return;
+ int adjustment = 0;
+ for (const auto& i : adjustments) {
+ if (*offset <= i.original_offset)
+ break;
+ if (*offset < (i.original_offset + i.original_length)) {
+ *offset = string16::npos;
+ return;
+ }
+ adjustment += static_cast<int>(i.original_length - i.output_length);
+ }
+ *offset -= adjustment;
+
+ if (*offset > limit)
+ *offset = string16::npos;
+}
+
+// static
+void OffsetAdjuster::UnadjustOffsets(
+ const Adjustments& adjustments,
+ std::vector<size_t>* offsets_for_unadjustment) {
+ if (!offsets_for_unadjustment || adjustments.empty())
+ return;
+ for (auto& i : *offsets_for_unadjustment)
+ UnadjustOffset(adjustments, &i);
+}
+
+// static
+void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
+ size_t* offset) {
+ if (*offset == string16::npos)
+ return;
+ int adjustment = 0;
+ for (const auto& i : adjustments) {
+ if (*offset + adjustment <= i.original_offset)
+ break;
+ adjustment += static_cast<int>(i.original_length - i.output_length);
+ if ((*offset + adjustment) < (i.original_offset + i.original_length)) {
+ *offset = string16::npos;
+ return;
+ }
+ }
+ *offset += adjustment;
+}
+
+// static
+void OffsetAdjuster::MergeSequentialAdjustments(
+ const Adjustments& first_adjustments,
+ Adjustments* adjustments_on_adjusted_string) {
+ auto adjusted_iter = adjustments_on_adjusted_string->begin();
+ auto first_iter = first_adjustments.begin();
+ // Simultaneously iterate over all |adjustments_on_adjusted_string| and
+ // |first_adjustments|, adding adjustments to or correcting the adjustments
+ // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the
+ // current number of characters collapsed by |first_adjustments| up to this
+ // point. |currently_collapsing| keeps track of the number of characters
+ // collapsed by |first_adjustments| into the current |adjusted_iter|'s
+ // length. These are characters that will change |shift| as soon as we're
+ // done processing the current |adjusted_iter|; they are not yet reflected in
+ // |shift|.
+ size_t shift = 0;
+ size_t currently_collapsing = 0;
+ while (adjusted_iter != adjustments_on_adjusted_string->end()) {
+ if ((first_iter == first_adjustments.end()) ||
+ ((adjusted_iter->original_offset + shift +
+ adjusted_iter->original_length) <= first_iter->original_offset)) {
+ // Entire |adjusted_iter| (accounting for its shift and including its
+ // whole original length) comes before |first_iter|.
+ //
+ // Correct the offset at |adjusted_iter| and move onto the next
+ // adjustment that needs revising.
+ adjusted_iter->original_offset += shift;
+ shift += currently_collapsing;
+ currently_collapsing = 0;
+ ++adjusted_iter;
+ } else if ((adjusted_iter->original_offset + shift) >
+ first_iter->original_offset) {
+ // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
+
+ // It's not possible for the adjustments to overlap. (It shouldn't
+ // be possible that we have an |adjusted_iter->original_offset| that,
+ // when adjusted by the computed |shift|, is in the middle of
+ // |first_iter|'s output's length. After all, that would mean the
+ // current adjustment_on_adjusted_string somehow points to an offset
+ // that was supposed to have been eliminated by the first set of
+ // adjustments.)
+ GURL_DCHECK_LE(first_iter->original_offset + first_iter->output_length,
+ adjusted_iter->original_offset + shift);
+
+ // Add the adjustment at |first_iter| to the full set of adjustments while
+ // making sure |adjusted_iter| continues pointing to the same element.
+ // We do this by inserting the |first_iter| adjustment right before
+ // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
+ // the following element.
+ shift += first_iter->original_length - first_iter->output_length;
+ adjusted_iter = adjustments_on_adjusted_string->insert(
+ adjusted_iter, *first_iter);
+ ++adjusted_iter;
+ ++first_iter;
+ } else {
+ // The first adjustment adjusted something that then got further adjusted
+ // by the second set of adjustments. In other words, |first_iter| points
+ // to something in the range covered by |adjusted_iter|'s length (after
+ // accounting for |shift|). Precisely,
+ // adjusted_iter->original_offset + shift
+ // <=
+ // first_iter->original_offset
+ // <
+ // adjusted_iter->original_offset + shift +
+ // adjusted_iter->original_length
+
+ // Modify the current |adjusted_iter| to include whatever collapsing
+ // happened in |first_iter|, then advance to the next |first_adjustments|
+ // because we dealt with the current one.
+ const int collapse = static_cast<int>(first_iter->original_length) -
+ static_cast<int>(first_iter->output_length);
+ // This function does not know how to deal with a string that expands and
+ // then gets modified, only strings that collapse and then get modified.
+ GURL_DCHECK_GT(collapse, 0);
+ adjusted_iter->original_length += collapse;
+ currently_collapsing += collapse;
+ ++first_iter;
+ }
+ }
+ GURL_DCHECK_EQ(0u, currently_collapsing);
+ if (first_iter != first_adjustments.end()) {
+ // Only first adjustments are left. These do not need to be modified.
+ // (Their offsets are already correct with respect to the original string.)
+ // Append them all.
+ GURL_DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
+ adjustments_on_adjusted_string->insert(
+ adjustments_on_adjusted_string->end(), first_iter,
+ first_adjustments.end());
+ }
+}
+
+// Converts the given source Unicode character type to the given destination
+// Unicode character type as a STL string. The given input buffer and size
+// determine the source, and the given output STL string will be replaced by
+// the result. If non-NULL, |adjustments| is set to reflect all the
+// alterations to the string that are not one-character-to-one-character.
+// It will always be sorted by increasing offset.
+template<typename SrcChar, typename DestStdString>
+bool ConvertUnicode(const SrcChar* src,
+ size_t src_len,
+ DestStdString* output,
+ OffsetAdjuster::Adjustments* adjustments) {
+ if (adjustments)
+ adjustments->clear();
+ // ICU requires 32-bit numbers.
+ bool success = true;
+ int32_t src_len32 = static_cast<int32_t>(src_len);
+ for (int32_t i = 0; i < src_len32; i++) {
+ uint32_t code_point;
+ size_t original_i = i;
+ size_t chars_written = 0;
+ if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
+ chars_written = WriteUnicodeCharacter(code_point, output);
+ } else {
+ chars_written = WriteUnicodeCharacter(0xFFFD, output);
+ success = false;
+ }
+
+ // Only bother writing an adjustment if this modification changed the
+ // length of this character.
+ // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
+ // character read, not after it (so that incrementing it in the loop
+ // increment will place it at the right location), so we need to account
+ // for that in determining the amount that was read.
+ if (adjustments && ((i - original_i + 1) != chars_written)) {
+ adjustments->push_back(OffsetAdjuster::Adjustment(
+ original_i, i - original_i + 1, chars_written));
+ }
+ }
+ return success;
+}
+
+bool UTF8ToUTF16WithAdjustments(
+ const char* src,
+ size_t src_len,
+ string16* output,
+ gurl_base::OffsetAdjuster::Adjustments* adjustments) {
+ PrepareForUTF16Or32Output(src, src_len, output);
+ return ConvertUnicode(src, src_len, output, adjustments);
+}
+
+string16 UTF8ToUTF16WithAdjustments(
+ const gurl_base::StringPiece& utf8,
+ gurl_base::OffsetAdjuster::Adjustments* adjustments) {
+ string16 result;
+ UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
+ return result;
+}
+
+string16 UTF8ToUTF16AndAdjustOffsets(
+ const gurl_base::StringPiece& utf8,
+ std::vector<size_t>* offsets_for_adjustment) {
+ for (size_t& offset : *offsets_for_adjustment) {
+ if (offset > utf8.length())
+ offset = string16::npos;
+ }
+ OffsetAdjuster::Adjustments adjustments;
+ string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
+ OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
+ return result;
+}
+
+std::string UTF16ToUTF8AndAdjustOffsets(
+ const gurl_base::StringPiece16& utf16,
+ std::vector<size_t>* offsets_for_adjustment) {
+ for (size_t& offset : *offsets_for_adjustment) {
+ if (offset > utf16.length())
+ offset = string16::npos;
+ }
+ std::string result;
+ PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
+ OffsetAdjuster::Adjustments adjustments;
+ ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
+ OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
+ return result;
+}
+
+} // namespace gurl_base
diff --git a/base/strings/utf_offset_string_conversions.h b/base/strings/utf_offset_string_conversions.h
new file mode 100644
index 0000000..8902ee5
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
+
+#include <stddef.h>
+
+#include <string>
+#include <vector>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// A helper class and associated data structures to adjust offsets into a
+// string in response to various adjustments one might do to that string
+// (e.g., eliminating a range). For details on offsets, see the comments by
+// the AdjustOffsets() function below.
+class BASE_EXPORT OffsetAdjuster {
+ public:
+ struct BASE_EXPORT Adjustment {
+ Adjustment(size_t original_offset,
+ size_t original_length,
+ size_t output_length);
+
+ size_t original_offset;
+ size_t original_length;
+ size_t output_length;
+ };
+ typedef std::vector<Adjustment> Adjustments;
+
+ // Adjusts all offsets in |offsets_for_adjustment| to reflect the adjustments
+ // recorded in |adjustments|. Adjusted offsets greater than |limit| will be
+ // set to string16::npos.
+ //
+ // Offsets represent insertion/selection points between characters: if |src|
+ // is "abcd", then 0 is before 'a', 2 is between 'b' and 'c', and 4 is at the
+ // end of the string. Valid input offsets range from 0 to |src_len|. On
+ // exit, each offset will have been modified to point at the same logical
+ // position in the output string. If an offset cannot be successfully
+ // adjusted (e.g., because it points into the middle of a multibyte sequence),
+ // it will be set to string16::npos.
+ static void AdjustOffsets(const Adjustments& adjustments,
+ std::vector<size_t>* offsets_for_adjustment,
+ size_t limit = string16::npos);
+
+ // Adjusts the single |offset| to reflect the adjustments recorded in
+ // |adjustments|.
+ static void AdjustOffset(const Adjustments& adjustments,
+ size_t* offset,
+ size_t limit = string16::npos);
+
+ // Adjusts all offsets in |offsets_for_unadjustment| to reflect the reverse
+ // of the adjustments recorded in |adjustments|. In other words, the offsets
+ // provided represent offsets into an adjusted string and the caller wants
+ // to know the offsets they correspond to in the original string. If an
+ // offset cannot be successfully unadjusted (e.g., because it points into
+ // the middle of a multibyte sequence), it will be set to string16::npos.
+ static void UnadjustOffsets(const Adjustments& adjustments,
+ std::vector<size_t>* offsets_for_unadjustment);
+
+ // Adjusts the single |offset| to reflect the reverse of the adjustments
+ // recorded in |adjustments|.
+ static void UnadjustOffset(const Adjustments& adjustments,
+ size_t* offset);
+
+ // Combines two sequential sets of adjustments, storing the combined revised
+ // adjustments in |adjustments_on_adjusted_string|. That is, suppose a
+ // string was altered in some way, with the alterations recorded as
+ // adjustments in |first_adjustments|. Then suppose the resulting string is
+ // further altered, with the alterations recorded as adjustments stored in
+ // |adjustments_on_adjusted_string|, with the offsets recorded in these
+ // adjustments being with respect to the intermediate string. This function
+ // combines the two sets of adjustments into one, storing the result in
+ // |adjustments_on_adjusted_string|, whose offsets are correct with respect
+ // to the original string.
+ //
+ // Assumes both parameters are sorted by increasing offset.
+ //
+ // WARNING: Only supports |first_adjustments| that involve collapsing ranges
+ // of text, not expanding ranges.
+ static void MergeSequentialAdjustments(
+ const Adjustments& first_adjustments,
+ Adjustments* adjustments_on_adjusted_string);
+};
+
+// Like the conversions in utf_string_conversions.h, but also fills in an
+// |adjustments| parameter that reflects the alterations done to the string.
+// It may be NULL.
+BASE_EXPORT bool UTF8ToUTF16WithAdjustments(
+ const char* src,
+ size_t src_len,
+ string16* output,
+ gurl_base::OffsetAdjuster::Adjustments* adjustments);
+BASE_EXPORT string16 UTF8ToUTF16WithAdjustments(
+ const gurl_base::StringPiece& utf8,
+ gurl_base::OffsetAdjuster::Adjustments* adjustments);
+// As above, but instead internally examines the adjustments and applies them
+// to |offsets_for_adjustment|. Input offsets greater than the length of the
+// input string will be set to string16::npos. See comments by AdjustOffsets().
+BASE_EXPORT string16 UTF8ToUTF16AndAdjustOffsets(
+ const gurl_base::StringPiece& utf8,
+ std::vector<size_t>* offsets_for_adjustment);
+BASE_EXPORT std::string UTF16ToUTF8AndAdjustOffsets(
+ const gurl_base::StringPiece16& utf16,
+ std::vector<size_t>* offsets_for_adjustment);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_UTF_OFFSET_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_offset_string_conversions_unittest.cc b/base/strings/utf_offset_string_conversions_unittest.cc
new file mode 100644
index 0000000..4691cb3
--- /dev/null
+++ b/base/strings/utf_offset_string_conversions_unittest.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include <algorithm>
+
+#include "polyfills/base/logging.h"
+#include "base/stl_util.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/utf_offset_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+namespace {
+
+static const size_t kNpos = string16::npos;
+
+} // namespace
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
+ struct UTF8ToUTF16Case {
+ const char* utf8;
+ size_t input_offset;
+ size_t output_offset;
+ } utf8_to_utf16_cases[] = {
+ {"", 0, 0},
+ {"", kNpos, kNpos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
+ {"\xed\xb0\x80z", 3, 3},
+ {"A\xF0\x90\x8C\x80z", 1, 1},
+ {"A\xF0\x90\x8C\x80z", 2, kNpos},
+ {"A\xF0\x90\x8C\x80z", 5, 3},
+ {"A\xF0\x90\x8C\x80z", 6, 4},
+ {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
+ };
+ for (const auto& i : utf8_to_utf16_cases) {
+ const size_t offset = i.input_offset;
+ std::vector<size_t> offsets;
+ offsets.push_back(offset);
+ UTF8ToUTF16AndAdjustOffsets(i.utf8, &offsets);
+ EXPECT_EQ(i.output_offset, offsets[0]);
+ }
+
+ struct UTF16ToUTF8Case {
+ char16 utf16[10];
+ size_t input_offset;
+ size_t output_offset;
+ } utf16_to_utf8_cases[] = {
+ {{}, 0, 0},
+ // Converted to 3-byte utf-8 sequences
+ {{0x5909, 0x63DB}, 3, kNpos},
+ {{0x5909, 0x63DB}, 2, 6},
+ {{0x5909, 0x63DB}, 1, 3},
+ {{0x5909, 0x63DB}, 0, 0},
+ // Converted to 2-byte utf-8 sequences
+ {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
+ {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
+ {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
+ {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
+ // Surrogate pair
+ {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
+ {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
+ {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
+ {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
+ };
+ for (size_t i = 0; i < gurl_base::size(utf16_to_utf8_cases); ++i) {
+ size_t offset = utf16_to_utf8_cases[i].input_offset;
+ std::vector<size_t> offsets;
+ offsets.push_back(offset);
+ UTF16ToUTF8AndAdjustOffsets(utf16_to_utf8_cases[i].utf16, &offsets);
+ EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offsets[0]) << i;
+ }
+}
+
+TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
+ const OffsetAdjuster::Adjustments kNoAdjustments;
+ const size_t kLimit = 10;
+ const size_t kItems = 20;
+ std::vector<size_t> size_ts;
+ for (size_t t = 0; t < kItems; ++t) {
+ size_ts.push_back(t);
+ OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
+ }
+ size_t unlimited_count = 0;
+ for (auto ti : size_ts) {
+ if (ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(11U, unlimited_count);
+
+ // Reverse the values in the vector and try again.
+ size_ts.clear();
+ for (size_t t = kItems; t > 0; --t) {
+ size_ts.push_back(t - 1);
+ OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
+ }
+ unlimited_count = 0;
+ for (auto ti : size_ts) {
+ if (ti != kNpos)
+ ++unlimited_count;
+ }
+ EXPECT_EQ(11U, unlimited_count);
+}
+
+TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
+ // Imagine we have strings as shown in the following cases where the
+ // X's represent encoded characters.
+ // 1: abcXXXdef ==> abcXdef
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 9; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
+ OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
+ size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_1));
+ for (size_t i = 0; i < gurl_base::size(expected_1); ++i)
+ EXPECT_EQ(expected_1[i], offsets[i]);
+ }
+
+ // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 23; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
+ adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
+ adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
+ adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
+ OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
+ size_t expected_2[] = {
+ 0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
+ kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
+ };
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_2));
+ for (size_t i = 0; i < gurl_base::size(expected_2); ++i)
+ EXPECT_EQ(expected_2[i], offsets[i]);
+ }
+
+ // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 17; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
+ adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
+ adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
+ adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
+ OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
+ size_t expected_3[] = {
+ 0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
+ 12, kNpos, 12
+ };
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_3));
+ for (size_t i = 0; i < gurl_base::size(expected_3); ++i)
+ EXPECT_EQ(expected_3[i], offsets[i]);
+ }
+}
+
+TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
+ // Imagine we have strings as shown in the following cases where the
+ // X's represent encoded characters.
+ // 1: abcXXXdef ==> abcXdef
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 7; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
+ OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
+ size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_1));
+ for (size_t i = 0; i < gurl_base::size(expected_1); ++i)
+ EXPECT_EQ(expected_1[i], offsets[i]);
+ }
+
+ // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 14; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
+ adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
+ adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
+ adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
+ OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
+ size_t expected_2[] = {
+ 0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
+ };
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_2));
+ for (size_t i = 0; i < gurl_base::size(expected_2); ++i)
+ EXPECT_EQ(expected_2[i], offsets[i]);
+ }
+
+ // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
+ {
+ std::vector<size_t> offsets;
+ for (size_t t = 0; t <= 12; ++t)
+ offsets.push_back(t);
+ OffsetAdjuster::Adjustments adjustments;
+ adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
+ adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
+ adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
+ adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
+ OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
+ size_t expected_3[] = {
+ 0, // this could just as easily be 3
+ 4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
+ 15 // this could just as easily be 17
+ };
+ EXPECT_EQ(offsets.size(), gurl_base::size(expected_3));
+ for (size_t i = 0; i < gurl_base::size(expected_3); ++i)
+ EXPECT_EQ(expected_3[i], offsets[i]);
+ }
+}
+
+// MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
+// net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
+// NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
+// is simply a short, additional test.
+TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
+ // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
+
+ // Set up |first_adjustments| to
+ // - remove the leading "a"
+ // - combine the "bc" into one character (call it ".")
+ // - remove the "f"
+ // - remove the "tuv"
+ // The resulting string should be ".deghijklmnopqrswxyz".
+ OffsetAdjuster::Adjustments first_adjustments;
+ first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
+ first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
+ first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
+ first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
+
+ // Set up |adjustments_on_adjusted_string| to
+ // - combine the "." character that replaced "bc" with "d" into one character
+ // (call it "?")
+ // - remove the "egh"
+ // - expand the "i" into two characters (call them "12")
+ // - combine the "jkl" into one character (call it "@")
+ // - expand the "z" into two characters (call them "34")
+ // The resulting string should be "?12@mnopqrswxy34".
+ OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
+ adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
+ 0, 2, 1));
+ adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
+ 2, 3, 0));
+ adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
+ 5, 1, 2));
+ adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
+ 6, 3, 1));
+ adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
+ 19, 1, 2));
+
+ // Now merge the adjustments and check the results.
+ OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
+ &adjustments_on_adjusted_string);
+ // The merged adjustments should look like
+ // - combine abcd into "?"
+ // - note: it's also reasonable for the Merge function to instead produce
+ // two adjustments instead of this, one to remove a and another to
+ // combine bcd into "?". This test verifies the current behavior.
+ // - remove efgh
+ // - expand i into "12"
+ // - combine jkl into "@"
+ // - remove tuv
+ // - expand z into "34"
+ ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
+ EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
+ EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
+ EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
+ EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
+ EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
+ EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
+ EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
+ EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
+ EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
+ EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
+ EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
+ EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
+ EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
+ EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
+ EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
+ EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
+ EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
+ EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
+}
+
+} // namespace gurl_base
diff --git a/base/strings/utf_string_conversion_utils.cc b/base/strings/utf_string_conversion_utils.cc
new file mode 100644
index 0000000..ce432e7
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+#include "base/third_party/icu/icu_utf.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+bool ReadUnicodeCharacter(const char* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point_out) {
+ // CBU8_NEXT expects to be able to use -1 to signal an error, so we must
+ // use a signed type for code_point. But this function returns false
+ // on error anyway, so code_point_out is unsigned.
+ int32_t code_point;
+ CBU8_NEXT(src, *char_index, src_len, code_point);
+ *code_point_out = static_cast<uint32_t>(code_point);
+
+ // The ICU macro above moves to the next char, we want to point to the last
+ // char consumed.
+ (*char_index)--;
+
+ // Validate the decoded value.
+ return IsValidCodepoint(code_point);
+}
+
+bool ReadUnicodeCharacter(const char16* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point) {
+ if (CBU16_IS_SURROGATE(src[*char_index])) {
+ if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
+ *char_index + 1 >= src_len ||
+ !CBU16_IS_TRAIL(src[*char_index + 1])) {
+ // Invalid surrogate pair.
+ return false;
+ }
+
+ // Valid surrogate pair.
+ *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
+ src[*char_index + 1]);
+ (*char_index)++;
+ } else {
+ // Not a surrogate, just one 16-bit word.
+ *code_point = src[*char_index];
+ }
+
+ return IsValidCodepoint(*code_point);
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+bool ReadUnicodeCharacter(const wchar_t* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point) {
+ // Conversion is easy since the source is 32-bit.
+ *code_point = src[*char_index];
+
+ // Validate the value.
+ return IsValidCodepoint(*code_point);
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
+ if (code_point <= 0x7f) {
+ // Fast path the common case of one byte.
+ output->push_back(static_cast<char>(code_point));
+ return 1;
+ }
+
+
+ // CBU8_APPEND_UNSAFE can append up to 4 bytes.
+ size_t char_offset = output->length();
+ size_t original_char_offset = char_offset;
+ output->resize(char_offset + CBU8_MAX_LENGTH);
+
+ CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+
+ // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
+ // it will represent the new length of the string.
+ output->resize(char_offset);
+ return char_offset - original_char_offset;
+}
+
+size_t WriteUnicodeCharacter(uint32_t code_point, string16* output) {
+ if (CBU16_LENGTH(code_point) == 1) {
+ // This code point is in the Basic Multilingual Plane (BMP).
+ output->push_back(static_cast<char16>(code_point));
+ return 1;
+ }
+ // Non-BMP characters use a double-character encoding.
+ size_t char_offset = output->length();
+ output->resize(char_offset + CBU16_MAX_LENGTH);
+ CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
+ return CBU16_MAX_LENGTH;
+}
+
+// Generalized Unicode converter -----------------------------------------------
+
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src,
+ size_t src_len,
+ std::string* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (src[0] < 0x80) {
+ // Assume that the entire input will be ASCII.
+ output->reserve(src_len);
+ } else {
+ // Assume that the entire input is non-ASCII and will have 3 bytes per char.
+ output->reserve(src_len * 3);
+ }
+}
+
+// Instantiate versions we know callers will need.
+#if !defined(OS_WIN)
+// wchar_t and char16 are the same thing on Windows.
+template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
+#endif
+template void PrepareForUTF8Output(const char16*, size_t, std::string*);
+
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src,
+ size_t src_len,
+ STRING* output) {
+ output->clear();
+ if (src_len == 0)
+ return;
+ if (static_cast<unsigned char>(src[0]) < 0x80) {
+ // Assume the input is all ASCII, which means 1:1 correspondence.
+ output->reserve(src_len);
+ } else {
+ // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
+ // character.
+ output->reserve(src_len / 2);
+ }
+}
+
+// Instantiate versions we know callers will need.
+#if !defined(OS_WIN)
+// std::wstring and string16 are the same thing on Windows.
+template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
+#endif
+template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
+
+} // namespace gurl_base
diff --git a/base/strings/utf_string_conversion_utils.h b/base/strings/utf_string_conversion_utils.h
new file mode 100644
index 0000000..84d18f7
--- /dev/null
+++ b/base/strings/utf_string_conversion_utils.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+#define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
+
+// Low-level UTF handling functions. Most code will want to use the functions
+// in utf_string_conversions.h
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+
+namespace gurl_base {
+
+inline bool IsValidCodepoint(uint32_t code_point) {
+ // Accepts every code point except the surrogate range ([0xD800, 0xDFFF])
+ // and values larger than 0x10FFFF (the highest codepoint allowed).
+ // Non-characters and unassigned codepoints are allowed.
+ return code_point < 0xD800u ||
+ (code_point >= 0xE000u && code_point <= 0x10FFFFu);
+}
+
+inline bool IsValidCharacter(uint32_t code_point) {
+ // Rejects surrogates and values above 0x10FFFF, and also non-characters
+ // (U+FDD0..U+FDEF, and all codepoints whose low 16 bits are FFFE or FFFF).
+ return code_point < 0xD800u || (code_point >= 0xE000u &&
+ code_point < 0xFDD0u) || (code_point > 0xFDEFu &&
+ code_point <= 0x10FFFFu && (code_point & 0xFFFEu) != 0xFFFEu);
+}
+
+// ReadUnicodeCharacter --------------------------------------------------------
+
+// Reads a UTF-8 stream, placing the next code point into the given output
+// |*code_point_out|. |src| represents the entire string to read, and
+// |*char_index| is the character offset within the string to start reading at.
+// |*char_index| will be updated to index the last character read, such that
+// incrementing it (as in a for loop) takes the reader to the next character.
+//
+// Returns true on success. On false, |*code_point_out| will be invalid.
+BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point_out);
+
+// Reads a UTF-16 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const char16* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Reads UTF-32 character. The usage is the same as the 8-bit version above.
+BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
+ int32_t src_len,
+ int32_t* char_index,
+ uint32_t* code_point);
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// WriteUnicodeCharacter -------------------------------------------------------
+
+// Appends a UTF-8 character to the given 8-bit string. Returns the number of
+// bytes written.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point,
+ std::string* output);
+
+// Appends the given code point as a UTF-16 character to the given 16-bit
+// string. Returns the number of 16-bit values written.
+BASE_EXPORT size_t WriteUnicodeCharacter(uint32_t code_point, string16* output);
+
+#if defined(WCHAR_T_IS_UTF32)
+// Appends the given UTF-32 character to the given 32-bit string. Returns the
+// number of 32-bit values written.
+inline size_t WriteUnicodeCharacter(uint32_t code_point, std::wstring* output) {
+ // This is the easy case, just append the character.
+ output->push_back(code_point);
+ return 1;
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// Generalized Unicode converter -----------------------------------------------
+
+// Guesses the length of the output in UTF-8 in bytes, clears that output
+// string, and reserves that amount of space. We assume that the input
+// character types are unsigned, which will be true for UTF-16 and -32 on our
+// systems.
+template<typename CHAR>
+void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
+
+// Prepares an output buffer (containing either UTF-16 or -32 data) given some
+// UTF-8 input that will be converted to it. See PrepareForUTF8Output().
+template<typename STRING>
+void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
diff --git a/base/strings/utf_string_conversions.cc b/base/strings/utf_string_conversions.cc
new file mode 100644
index 0000000..aaf4a40
--- /dev/null
+++ b/base/strings/utf_string_conversions.cc
@@ -0,0 +1,342 @@
+// Copyright (c) 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/utf_string_conversions.h"
+
+#include <limits.h>
+#include <stdint.h>
+
+#include <type_traits>
+
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversion_utils.h"
+#include "base/third_party/icu/icu_utf.h"
+#include "build/build_config.h"
+
+namespace gurl_base {
+
+namespace {
+
+constexpr int32_t kErrorCodePoint = 0xFFFD;
+
+// Size coefficient ----------------------------------------------------------
+// The maximum number of codeunits in the destination encoding corresponding to
+// one codeunit in the source encoding.
+
+template <typename SrcChar, typename DestChar>
+struct SizeCoefficient {
+ static_assert(sizeof(SrcChar) < sizeof(DestChar),
+ "Default case: from a smaller encoding to the bigger one");
+
+ // At most one output codeunit per input codeunit (ASCII is exactly 1:1).
+ static constexpr int value = 1;
+};
+
+template <>
+struct SizeCoefficient<char16, char> {
+ // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
+ static constexpr int value = 3;
+};
+
+#if defined(WCHAR_T_IS_UTF32)
+template <>
+struct SizeCoefficient<wchar_t, char> {
+ // UTF-8 uses at most 4 codeunits per character.
+ static constexpr int value = 4;
+};
+
+template <>
+struct SizeCoefficient<wchar_t, char16> {
+ // UTF-16 uses at most 2 codeunits per character.
+ static constexpr int value = 2;
+};
+#endif // defined(WCHAR_T_IS_UTF32)
+
+template <typename SrcChar, typename DestChar>
+constexpr int size_coefficient_v =
+ SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
+
+// UnicodeAppendUnsafe --------------------------------------------------------
+// Function overloads that write code_point to the output string. Output string
+// has to have enough space for the codepoint.
+
+// Convenience typedef that checks whether the passed in type is integral (i.e.
+// bool, char, int or their extended versions) and is of the correct size.
+template <typename Char, size_t N>
+using EnableIfBitsAre = std::enable_if_t<std::is_integral<Char>::value &&
+ CHAR_BIT * sizeof(Char) == N,
+ bool>;
+
+template <typename Char, EnableIfBitsAre<Char, 8> = true>
+void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
+ CBU8_APPEND_UNSAFE(out, *size, code_point);
+}
+
+template <typename Char, EnableIfBitsAre<Char, 16> = true>
+void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
+ CBU16_APPEND_UNSAFE(out, *size, code_point);
+}
+
+template <typename Char, EnableIfBitsAre<Char, 32> = true>
+void UnicodeAppendUnsafe(Char* out, int32_t* size, uint32_t code_point) {
+ out[(*size)++] = code_point;
+}
+
+// DoUTFConversion ------------------------------------------------------------
+// Main driver of UTFConversion specialized for different Src encodings.
+// dest has to have enough room for the converted text.
+
+template <typename DestChar>
+bool DoUTFConversion(const char* src,
+ int32_t src_len,
+ DestChar* dest,
+ int32_t* dest_len) {
+ bool success = true;
+
+ for (int32_t i = 0; i < src_len;) {
+ int32_t code_point;
+ CBU8_NEXT(src, i, src_len, code_point);
+
+ if (!IsValidCodepoint(code_point)) {
+ success = false;
+ code_point = kErrorCodePoint;
+ }
+
+ UnicodeAppendUnsafe(dest, dest_len, code_point);
+ }
+
+ return success;
+}
+
+template <typename DestChar>
+bool DoUTFConversion(const char16* src,
+ int32_t src_len,
+ DestChar* dest,
+ int32_t* dest_len) {
+ bool success = true;
+
+ auto ConvertSingleChar = [&success](char16 in) -> int32_t {
+ if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
+ success = false;
+ return kErrorCodePoint;
+ }
+ return in;
+ };
+
+ int32_t i = 0;
+
+ // Stop one unit before the end so that src[i + 1] is always in bounds when
+ // testing for a surrogate pair; a leftover final unit is handled below.
+ while (i < src_len - 1) {
+ int32_t code_point;
+
+ if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
+ code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
+ if (!IsValidCodepoint(code_point)) {
+ code_point = kErrorCodePoint;
+ success = false;
+ }
+ i += 2;
+ } else {
+ code_point = ConvertSingleChar(src[i]);
+ ++i;
+ }
+
+ UnicodeAppendUnsafe(dest, dest_len, code_point);
+ }
+
+ if (i < src_len)
+ UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
+
+ return success;
+}
+
+#if defined(WCHAR_T_IS_UTF32)
+
+template <typename DestChar>
+bool DoUTFConversion(const wchar_t* src,
+ int32_t src_len,
+ DestChar* dest,
+ int32_t* dest_len) {
+ bool success = true;
+
+ for (int32_t i = 0; i < src_len; ++i) {
+ int32_t code_point = src[i];
+
+ if (!IsValidCodepoint(code_point)) {
+ success = false;
+ code_point = kErrorCodePoint;
+ }
+
+ UnicodeAppendUnsafe(dest, dest_len, code_point);
+ }
+
+ return success;
+}
+
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// UTFConversion --------------------------------------------------------------
+// Function template for generating all UTF conversions.
+
+template <typename InputString, typename DestString>
+bool UTFConversion(const InputString& src_str, DestString* dest_str) {
+ if (IsStringASCII(src_str)) {
+ dest_str->assign(src_str.begin(), src_str.end());
+ return true;
+ }
+
+ dest_str->resize(src_str.length() *
+ size_coefficient_v<typename InputString::value_type,
+ typename DestString::value_type>);
+
+ // Empty strings are ASCII, so dest_str is non-empty: operator[] is OK.
+ auto* dest = &(*dest_str)[0];
+
+ // ICU requires 32 bit numbers. NOTE(review): longer inputs would truncate.
+ int32_t src_len32 = static_cast<int32_t>(src_str.length());
+ int32_t dest_len32 = 0;
+
+ bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);
+
+ dest_str->resize(dest_len32);
+ dest_str->shrink_to_fit();
+
+ return res;
+}
+
+} // namespace
+
+// UTF16 <-> UTF8 --------------------------------------------------------------
+
+bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
+ return UTFConversion(StringPiece(src, src_len), output);
+}
+
+string16 UTF8ToUTF16(StringPiece utf8) {
+ string16 ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
+ return ret;
+}
+
+bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
+ return UTFConversion(StringPiece16(src, src_len), output);
+}
+
+std::string UTF16ToUTF8(StringPiece16 utf16) {
+ std::string ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
+ return ret;
+}
+
+// UTF-16 <-> Wide -------------------------------------------------------------
+
+#if defined(WCHAR_T_IS_UTF16)
+// When wide == UTF-16 the conversions are a NOP.
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
+ output->assign(src, src + src_len);
+ return true;
+}
+
+string16 WideToUTF16(WStringPiece wide) {
+ return string16(wide.begin(), wide.end());
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+ output->assign(src, src + src_len);
+ return true;
+}
+
+std::wstring UTF16ToWide(StringPiece16 utf16) {
+ return std::wstring(utf16.begin(), utf16.end());
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
+ return UTFConversion(gurl_base::WStringPiece(src, src_len), output);
+}
+
+string16 WideToUTF16(WStringPiece wide) {
+ string16 ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ WideToUTF16(wide.data(), wide.length(), &ret);
+ return ret;
+}
+
+bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
+ return UTFConversion(StringPiece16(src, src_len), output);
+}
+
+std::wstring UTF16ToWide(StringPiece16 utf16) {
+ std::wstring ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF16ToWide(utf16.data(), utf16.length(), &ret);
+ return ret;
+}
+
+#endif // defined(WCHAR_T_IS_UTF32)
+
+// UTF-8 <-> Wide --------------------------------------------------------------
+
+// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
+
+bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
+ return UTFConversion(StringPiece(src, src_len), output);
+}
+
+std::wstring UTF8ToWide(StringPiece utf8) {
+ std::wstring ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ UTF8ToWide(utf8.data(), utf8.length(), &ret);
+ return ret;
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+// Easy case since we can use the "utf" versions we already wrote above.
+
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
+ return UTF16ToUTF8(as_u16cstr(src), src_len, output);
+}
+
+std::string WideToUTF8(WStringPiece wide) {
+ return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+
+bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
+ return UTFConversion(WStringPiece(src, src_len), output);
+}
+
+std::string WideToUTF8(WStringPiece wide) {
+ std::string ret;
+ // Ignore the success flag of this call, it will do the best it can for
+ // invalid input, which is what we want here.
+ WideToUTF8(wide.data(), wide.length(), &ret);
+ return ret;
+}
+
+#endif // defined(WCHAR_T_IS_UTF32)
+
+string16 ASCIIToUTF16(StringPiece ascii) {
+ GURL_DCHECK(IsStringASCII(ascii)) << ascii;
+ return string16(ascii.begin(), ascii.end());
+}
+
+std::string UTF16ToASCII(StringPiece16 utf16) {
+ GURL_DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
+ return std::string(utf16.begin(), utf16.end());
+}
+
+} // namespace gurl_base
diff --git a/base/strings/utf_string_conversions.h b/base/strings/utf_string_conversions.h
new file mode 100644
index 0000000..e64f420
--- /dev/null
+++ b/base/strings/utf_string_conversions.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_UTF_STRING_CONVERSIONS_H_
+#define BASE_STRINGS_UTF_STRING_CONVERSIONS_H_
+
+#include <stddef.h>
+
+#include <string>
+
+#include "polyfills/base/base_export.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+
+namespace gurl_base {
+
+// These convert between UTF-8, -16, and -32 strings. They are potentially slow,
+// so avoid unnecessary conversions. The low-level versions return a boolean
+// indicating whether the conversion was 100% valid. When it was not, they still
+// do the best they can and put the result in the output buffer. The versions
+// that return strings ignore this error and just return the best conversion
+// possible.
+BASE_EXPORT bool WideToUTF8(const wchar_t* src, size_t src_len,
+ std::string* output);
+BASE_EXPORT std::string WideToUTF8(WStringPiece wide);
+BASE_EXPORT bool UTF8ToWide(const char* src, size_t src_len,
+ std::wstring* output);
+BASE_EXPORT std::wstring UTF8ToWide(StringPiece utf8);
+
+BASE_EXPORT bool WideToUTF16(const wchar_t* src, size_t src_len,
+ string16* output);
+BASE_EXPORT string16 WideToUTF16(WStringPiece wide);
+BASE_EXPORT bool UTF16ToWide(const char16* src, size_t src_len,
+ std::wstring* output);
+BASE_EXPORT std::wstring UTF16ToWide(StringPiece16 utf16);
+
+BASE_EXPORT bool UTF8ToUTF16(const char* src, size_t src_len, string16* output);
+BASE_EXPORT string16 UTF8ToUTF16(StringPiece utf8);
+BASE_EXPORT bool UTF16ToUTF8(const char16* src, size_t src_len,
+ std::string* output);
+BASE_EXPORT std::string UTF16ToUTF8(StringPiece16 utf16);
+
+// This converts an ASCII string, typically a hardcoded constant, to a UTF16
+// string.
+BASE_EXPORT string16 ASCIIToUTF16(StringPiece ascii);
+
+// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
+// beforehand.
+BASE_EXPORT std::string UTF16ToASCII(StringPiece16 utf16);
+
+} // namespace gurl_base
+
+#endif // BASE_STRINGS_UTF_STRING_CONVERSIONS_H_
diff --git a/base/strings/utf_string_conversions_fuzzer.cc b/base/strings/utf_string_conversions_fuzzer.cc
new file mode 100644
index 0000000..96bccda
--- /dev/null
+++ b/base/strings/utf_string_conversions_fuzzer.cc
@@ -0,0 +1,56 @@
+// Copyright 2018 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+
+std::string output_std_string;
+std::wstring output_std_wstring;
+gurl_base::string16 output_string16;
+
+// Entry point for LibFuzzer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data),
+ size);
+
+ gurl_base::UTF8ToWide(string_piece_input);
+ gurl_base::UTF8ToWide(reinterpret_cast<const char*>(data), size,
+ &output_std_wstring);
+ gurl_base::UTF8ToUTF16(string_piece_input);
+ gurl_base::UTF8ToUTF16(reinterpret_cast<const char*>(data), size,
+ &output_string16);
+
+ // Test for char16.
+ if (size % 2 == 0) {
+ gurl_base::StringPiece16 string_piece_input16(
+ reinterpret_cast<const gurl_base::char16*>(data), size / 2);
+ gurl_base::UTF16ToWide(output_string16);
+ gurl_base::UTF16ToWide(reinterpret_cast<const gurl_base::char16*>(data), size / 2,
+ &output_std_wstring);
+ gurl_base::UTF16ToUTF8(string_piece_input16);
+ gurl_base::UTF16ToUTF8(reinterpret_cast<const gurl_base::char16*>(data), size / 2,
+ &output_std_string);
+ }
+
+ // Test for wchar_t.
+ size_t wchar_t_size = sizeof(wchar_t);
+ if (size % wchar_t_size == 0) {
+ gurl_base::WideToUTF8(output_std_wstring);
+ gurl_base::WideToUTF8(reinterpret_cast<const wchar_t*>(data),
+ size / wchar_t_size, &output_std_string);
+ gurl_base::WideToUTF16(output_std_wstring);
+ gurl_base::WideToUTF16(reinterpret_cast<const wchar_t*>(data),
+ size / wchar_t_size, &output_string16);
+ }
+
+ // Test for ASCII. This condition is needed to avoid hitting instant
+ // GURL_DCHECK failures in ASCIIToUTF16() and UTF16ToASCII().
+ if (gurl_base::IsStringASCII(string_piece_input)) {
+ output_string16 = gurl_base::ASCIIToUTF16(string_piece_input);
+ gurl_base::StringPiece16 string_piece_input16(output_string16);
+ gurl_base::UTF16ToASCII(string_piece_input16);
+ }
+
+ return 0;
+}
diff --git a/base/strings/utf_string_conversions_unittest.cc b/base/strings/utf_string_conversions_unittest.cc
new file mode 100644
index 0000000..22b167b
--- /dev/null
+++ b/base/strings/utf_string_conversions_unittest.cc
@@ -0,0 +1,208 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include "polyfills/base/logging.h"
+#include "base/stl_util.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "build/build_config.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace gurl_base {
+
+namespace {
+
+const wchar_t* const kConvertRoundtripCases[] = {
+ L"Google Video",
+ // "网页 图片 资讯更多 »"
+ L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
+ // "Παγκόσμιος Ιστός"
+ L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
+ L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
+ // "Поиск страниц на русском"
+ L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
+ L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
+ L"\x0020\x0440\x0443\x0441\x0441\x043a\x043e\x043c",
+ // "전체서비스"
+ L"\xc804\xccb4\xc11c\xbe44\xc2a4",
+
+ // Test characters that take more than 16 bits. This will depend on whether
+ // wchar_t is 16 or 32 bits.
+#if defined(WCHAR_T_IS_UTF16)
+ L"\xd800\xdf00",
+ // Five consecutive non-BMP code points (U+11D40 - U+11D44) as surrogate pairs.
+ L"\xd807\xdd40\xd807\xdd41\xd807\xdd42\xd807\xdd43\xd807\xdd44",
+#elif defined(WCHAR_T_IS_UTF32)
+ L"\x10300",
+ // Five consecutive non-BMP code points (U+11D40 - U+11D44), one unit each.
+ L"\x11d40\x11d41\x11d42\x11d43\x11d44",
+#endif
+};
+
+} // namespace
+
+TEST(UTFStringConversionsTest, ConvertUTF8AndWide) {
+ // we round-trip all the wide strings through UTF-8 to make sure everything
+ // agrees on the conversion. This uses the stream operators to test them
+ // simultaneously.
+ for (auto* i : kConvertRoundtripCases) {
+ std::ostringstream utf8;
+ utf8 << WideToUTF8(i);
+ std::wostringstream wide;
+ wide << UTF8ToWide(utf8.str());
+
+ EXPECT_EQ(i, wide.str());
+ }
+}
+
+TEST(UTFStringConversionsTest, ConvertUTF8AndWideEmptyString) {
+ // An empty std::wstring should be converted to an empty std::string,
+ // and vice versa.
+ std::wstring wempty;
+ std::string empty;
+ EXPECT_EQ(empty, WideToUTF8(wempty));
+ EXPECT_EQ(wempty, UTF8ToWide(empty));
+}
+
+TEST(UTFStringConversionsTest, ConvertUTF8ToWide) {
+ struct UTF8ToWideCase {
+ const char* utf8;
+ const wchar_t* wide;
+ bool success;
+ } convert_cases[] = {
+ // Regular UTF-8 input.
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true},
+ // Non-character is passed through.
+ {"\xef\xbf\xbfHello", L"\xffffHello", true},
+ // Truncated UTF-8 sequence.
+ {"\xe4\xa0\xe5\xa5\xbd", L"\xfffd\x597d", false},
+ // Truncated off the end.
+ {"\xe5\xa5\xbd\xe4\xa0", L"\x597d\xfffd", false},
+ // Non-shortest-form UTF-8.
+ {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", L"\xfffd\xfffd\xfffd\xfffd\x597d", false},
+ // This UTF-8 character decodes to a UTF-16 surrogate, which is illegal.
+ {"\xed\xb0\x80", L"\xfffd\xfffd\xfffd", false},
+ // Non-BMP characters. The second is a non-character regarded as valid.
+ // The result will either be in UTF-16 or UTF-32.
+#if defined(WCHAR_T_IS_UTF16)
+ {"A\xF0\x90\x8C\x80z", L"A\xd800\xdf00z", true},
+ {"A\xF4\x8F\xBF\xBEz", L"A\xdbff\xdffez", true},
+#elif defined(WCHAR_T_IS_UTF32)
+ {"A\xF0\x90\x8C\x80z", L"A\x10300z", true},
+ {"A\xF4\x8F\xBF\xBEz", L"A\x10fffez", true},
+#endif
+ };
+
+ for (const auto& i : convert_cases) {
+ std::wstring converted;
+ EXPECT_EQ(i.success, UTF8ToWide(i.utf8, strlen(i.utf8), &converted));
+ std::wstring expected(i.wide);
+ EXPECT_EQ(expected, converted);
+ }
+
+ // Manually test an embedded NULL.
+ std::wstring converted;
+ EXPECT_TRUE(UTF8ToWide("\00Z\t", 3, &converted));
+ ASSERT_EQ(3U, converted.length());
+ EXPECT_EQ(static_cast<wchar_t>(0), converted[0]);
+ EXPECT_EQ('Z', converted[1]);
+ EXPECT_EQ('\t', converted[2]);
+
+ // Make sure that conversion replaces, not appends.
+ EXPECT_TRUE(UTF8ToWide("B", 1, &converted));
+ ASSERT_EQ(1U, converted.length());
+ EXPECT_EQ('B', converted[0]);
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+// This test is only valid when wchar_t == UTF-16.
+TEST(UTFStringConversionsTest, ConvertUTF16ToUTF8) {
+ struct WideToUTF8Case {
+ const wchar_t* utf16;
+ const char* utf8;
+ bool success;
+ } convert_cases[] = {
+ // Regular UTF-16 input.
+ {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+ // Test a non-BMP character.
+ {L"\xd800\xdf00", "\xF0\x90\x8C\x80", true},
+ // Non-characters are passed through.
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+ {L"\xdbff\xdffeHello", "\xF4\x8F\xBF\xBEHello", true},
+ // The first character is a truncated UTF-16 character.
+ {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
+ // Truncated at the end.
+ {L"\x597d\xd800", "\xe5\xa5\xbd\xef\xbf\xbd", false},
+ };
+
+ for (const auto& test : convert_cases) {
+ std::string converted;
+ EXPECT_EQ(test.success,
+ WideToUTF8(test.utf16, wcslen(test.utf16), &converted));
+ std::string expected(test.utf8);
+ EXPECT_EQ(expected, converted);
+ }
+}
+
+#elif defined(WCHAR_T_IS_UTF32)
+// This test is only valid when wchar_t == UTF-32.
+TEST(UTFStringConversionsTest, ConvertUTF32ToUTF8) {
+ struct WideToUTF8Case {
+ const wchar_t* utf32;
+ const char* utf8;
+ bool success;
+ } convert_cases[] = {
+ // Regular BMP input (each code point fits in 16 bits).
+ {L"\x4f60\x597d", "\xe4\xbd\xa0\xe5\xa5\xbd", true},
+ // Test a non-BMP character.
+ {L"A\x10300z", "A\xF0\x90\x8C\x80z", true},
+ // Non-characters are passed through.
+ {L"\xffffHello", "\xEF\xBF\xBFHello", true},
+ {L"\x10fffeHello", "\xF4\x8F\xBF\xBEHello", true},
+ // A value above U+10FFFF is invalid and is replaced by U+FFFD.
+ {L"\xfffffffHello", "\xEF\xBF\xBDHello", false},
+ // Lone surrogate values are invalid in UTF-32; each is replaced by U+FFFD.
+ {L"\xd800\x597d", "\xef\xbf\xbd\xe5\xa5\xbd", false},
+ {L"\xdc01Hello", "\xef\xbf\xbdHello", false},
+ };
+
+ for (const auto& test : convert_cases) {
+ std::string converted;
+ EXPECT_EQ(test.success,
+ WideToUTF8(test.utf32, wcslen(test.utf32), &converted));
+ std::string expected(test.utf8);
+ EXPECT_EQ(expected, converted);
+ }
+}
+#endif // defined(WCHAR_T_IS_UTF32)
+
+TEST(UTFStringConversionsTest, ConvertMultiString) {
+ static char16 multi16[] = {
+ 'f', 'o', 'o', '\0',
+ 'b', 'a', 'r', '\0',
+ 'b', 'a', 'z', '\0',
+ '\0'
+ };
+ static char multi[] = {
+ 'f', 'o', 'o', '\0',
+ 'b', 'a', 'r', '\0',
+ 'b', 'a', 'z', '\0',
+ '\0'
+ };
+ string16 multistring16;
+ memcpy(WriteInto(&multistring16, gurl_base::size(multi16)), multi16,
+ sizeof(multi16));
+ EXPECT_EQ(gurl_base::size(multi16) - 1, multistring16.length());
+ std::string expected;
+ memcpy(WriteInto(&expected, gurl_base::size(multi)), multi, sizeof(multi));
+ EXPECT_EQ(gurl_base::size(multi) - 1, expected.length());
+ const std::string& converted = UTF16ToUTF8(multistring16);
+ EXPECT_EQ(gurl_base::size(multi) - 1, converted.length());
+ EXPECT_EQ(expected, converted);
+}
+
+} // namespace gurl_base
diff --git a/base/template_util.h b/base/template_util.h
new file mode 100644
index 0000000..5384355
--- /dev/null
+++ b/base/template_util.h
@@ -0,0 +1,188 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_TEMPLATE_UTIL_H_
+#define BASE_TEMPLATE_UTIL_H_
+
+#include <stddef.h>
+#include <iosfwd>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "build/build_config.h"
+
+// Some versions of libstdc++ have partial support for type_traits, but misses
+// a smaller subset while removing some of the older non-standard stuff. Assume
+// that all versions below 5.0 fall in this category, along with one 5.0
+// experimental release. Test for this by consulting compiler major version,
+// the only reliable option available, so theoretically this could fail should
+// you attempt to mix an earlier version of libstdc++ with >= GCC5. But
+// that's unlikely to work out, especially as GCC5 changed ABI.
+#define CR_GLIBCXX_5_0_0 20150123
+#if (defined(__GNUC__) && __GNUC__ < 5) || \
+ (defined(__GLIBCXX__) && __GLIBCXX__ == CR_GLIBCXX_5_0_0)
+#define CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX
+#endif
+
+// This hacks around using gcc with libc++ which has some incompatibilities.
+// - is_trivially_* doesn't work: https://llvm.org/bugs/show_bug.cgi?id=27538
+// TODO(danakj): Remove this when android builders are all using a newer version
+// of gcc, or the android ndk is updated to a newer libc++ that works with older
+// gcc versions.
+#if !defined(__clang__) && defined(_LIBCPP_VERSION)
+#define CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX
+#endif
+
+namespace gurl_base {
+
+template <class T> struct is_non_const_reference : std::false_type {};
+template <class T> struct is_non_const_reference<T&> : std::true_type {};
+template <class T> struct is_non_const_reference<const T&> : std::false_type {};
+
+namespace internal {
+
+// Implementation detail of gurl_base::void_t below.
+template <typename...>
+struct make_void {
+ using type = void;
+};
+
+} // namespace internal
+
+// gurl_base::void_t is an implementation of std::void_t from C++17.
+//
+// We use |gurl_base::internal::make_void| as a helper struct to avoid a C++14
+// defect:
+// http://en.cppreference.com/w/cpp/types/void_t
+// http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558
+template <typename... Ts>
+using void_t = typename ::gurl_base::internal::make_void<Ts...>::type;
+
+namespace internal {
+
+// Uses expression SFINAE to detect whether using operator<< would work.
+template <typename T, typename = void>
+struct SupportsOstreamOperator : std::false_type {};
+template <typename T>
+struct SupportsOstreamOperator<T,
+ decltype(void(std::declval<std::ostream&>()
+ << std::declval<T>()))>
+ : std::true_type {};
+
+template <typename T, typename = void>
+struct SupportsToString : std::false_type {};
+template <typename T>
+struct SupportsToString<T, decltype(void(std::declval<T>().ToString()))>
+ : std::true_type {};
+
+// Used to detect whether the given type is an iterator. This is normally used
+// with std::enable_if to provide disambiguation for functions that take
+// templatized iterators as input.
+template <typename T, typename = void>
+struct is_iterator : std::false_type {};
+
+template <typename T>
+struct is_iterator<T,
+ void_t<typename std::iterator_traits<T>::iterator_category>>
+ : std::true_type {};
+
+} // namespace internal
+
+// is_trivially_copyable is especially hard to get right.
+// - Older versions of libstdc++ will fail to have it like they do for other
+// type traits. This has become a subset of the second point, but used to be
+// handled independently.
+// - An experimental release of gcc includes most of type_traits but misses
+// is_trivially_copyable, so we still have to avoid using libstdc++ in this
+// case, which is covered by CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX.
+// - When compiling libc++ from before r239653, with a gcc compiler, the
+// std::is_trivially_copyable can fail. So we need to work around that by not
+// using the one in libc++ in this case. This is covered by the
+// CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX define, and is discussed in
+// https://llvm.org/bugs/show_bug.cgi?id=27538#c1 where they point out that
+// in libc++'s commit r239653 this is fixed by libc++ checking for gcc 5.1.
+// - In both of the above cases we are using the gcc compiler. When defining
+// this ourselves on compiler intrinsics, the __is_trivially_copyable()
+// intrinsic is not available on gcc before version 5.1 (see the discussion in
+// https://llvm.org/bugs/show_bug.cgi?id=27538#c1 again), so we must check for
+// that version.
+// - When __is_trivially_copyable() is not available because we are on gcc older
+// than 5.1, we need to fall back to something, so we use __has_trivial_copy()
+// instead based on what was done one-off in bit_cast() previously.
+
+// TODO(crbug.com/554293): Remove this when all platforms have this in the std
+// namespace and it works with gcc as needed.
+#if defined(CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX) || \
+ defined(CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX)
+template <typename T>
+struct is_trivially_copyable {
+// TODO(danakj): Remove this when android builders are all using a newer version
+// of gcc, or the android ndk is updated to a newer libc++ that does this for
+// us.
+#if _GNUC_VER >= 501
+ static constexpr bool value = __is_trivially_copyable(T);
+#else
+ static constexpr bool value =
+ __has_trivial_copy(T) && __has_trivial_destructor(T);
+#endif
+};
+#else
+template <class T>
+using is_trivially_copyable = std::is_trivially_copyable<T>;
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ <= 7
+// Workaround for g++7 and earlier family.
+// Due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80654, without this
+// Optional<std::vector<T>> where T is non-copyable causes a compile error.
+// As we know it is not trivially copy constructible, explicitly declare so.
+template <typename T>
+struct is_trivially_copy_constructible
+ : std::is_trivially_copy_constructible<T> {};
+
+template <typename... T>
+struct is_trivially_copy_constructible<std::vector<T...>> : std::false_type {};
+#else
+// Otherwise use std::is_trivially_copy_constructible as is.
+template <typename T>
+using is_trivially_copy_constructible = std::is_trivially_copy_constructible<T>;
+#endif
+
+// gurl_base::in_place_t is an implementation of std::in_place_t from
+// C++17. A tag type used to request in-place construction in template vararg
+// constructors.
+
+// Specification:
+// https://en.cppreference.com/w/cpp/utility/in_place
+struct in_place_t {};
+constexpr in_place_t in_place = {};
+
+// gurl_base::in_place_type_t is an implementation of std::in_place_type_t from
+// C++17. A tag type used for in-place construction when the type to construct
+// needs to be specified, such as with gurl_base::unique_any, designed to be a
+// drop-in replacement.
+
+// Specification:
+// http://en.cppreference.com/w/cpp/utility/in_place
+template <typename T>
+struct in_place_type_t {};
+
+template <typename T>
+struct is_in_place_type_t {
+ static constexpr bool value = false;
+};
+
+template <typename... Ts>
+struct is_in_place_type_t<in_place_type_t<Ts...>> {
+ static constexpr bool value = true;
+};
+
+} // namespace gurl_base
+
+#undef CR_USE_FALLBACKS_FOR_GCC_WITH_LIBCXX
+#undef CR_USE_FALLBACKS_FOR_OLD_EXPERIMENTAL_GLIBCXX
+
+#endif // BASE_TEMPLATE_UTIL_H_
diff --git a/base/third_party/icu/BUILD b/base/third_party/icu/BUILD
new file mode 100644
index 0000000..97a033e
--- /dev/null
+++ b/base/third_party/icu/BUILD
@@ -0,0 +1,10 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "icu",
+ srcs = ["icu_utf.cc"],
+ hdrs = ["icu_utf.h"],
+ visibility = ["//visibility:public"],
+)
diff --git a/base/third_party/icu/LICENSE b/base/third_party/icu/LICENSE
new file mode 100644
index 0000000..2882e4e
--- /dev/null
+++ b/base/third_party/icu/LICENSE
@@ -0,0 +1,76 @@
+COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
+
+Copyright © 1991-2017 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+---------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+1. ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
diff --git a/base/third_party/icu/README.chromium b/base/third_party/icu/README.chromium
new file mode 100644
index 0000000..297e89a
--- /dev/null
+++ b/base/third_party/icu/README.chromium
@@ -0,0 +1,17 @@
+Name: ICU
+URL: http://site.icu-project.org/
+Version: 60
+License: Unicode
+License File: NOT_SHIPPED
+
+This file has the relevant components from ICU copied to handle basic UTF8/16/32
+conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h
+into icu_utf.h, and from utf_impl.cpp into icu_utf.cc.
+
+The main change is that U_/U8_/U16_ prefixes have been replaced with
+CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU
+macros should ICU be in use on the system. For the same reason, the functions
+and types have been put in the "base_icu" namespace.
+
+Note that this license file is marked as NOT_SHIPPED, since a more complete
+ICU license is included from //third_party/icu/README.chromium
diff --git a/base/third_party/icu/icu_utf.cc b/base/third_party/icu/icu_utf.cc
new file mode 100644
index 0000000..a3262b0
--- /dev/null
+++ b/base/third_party/icu/icu_utf.cc
@@ -0,0 +1,131 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 1999-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: utf_impl.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep13
+* created by: Markus W. Scherer
+*
+* This file provides implementation functions for macros in the utfXX.h
+* that would otherwise be too long as macros.
+*/
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base_icu {
+
+// source/common/utf_impl.cpp
+
+static const UChar32
+utf8_errorValue[6]={
+ // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
+ // but without relying on the obsolete unicode/utf_old.h.
+ 0x15, 0x9f, 0xffff,
+ 0x10ffff
+};
+
+static UChar32
+errorValue(int32_t count, int8_t strict) {
+ if(strict>=0) {
+ return utf8_errorValue[count];
+ } else if(strict==-3) {
+ return 0xfffd;
+ } else {
+ return CBU_SENTINEL;
+ }
+}
+
+/*
+ * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
+ * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
+ *
+ * U8_NEXT() supports NUL-terminated strings indicated via length<0.
+ *
+ * The "strict" parameter controls the error behavior:
+ * <0 "Safe" behavior of U8_NEXT():
+ * -1: All illegal byte sequences yield U_SENTINEL=-1.
+ * -2: Same as -1, except for lenient treatment of surrogate code points as legal.
+ * Some implementations use this for roundtripping of
+ * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
+ * contain unpaired surrogates.
+ * -3: All illegal byte sequences yield U+FFFD.
+ * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
+ * All illegal byte sequences yield a positive code point such that this
+ * result code point would be encoded with the same number of bytes as
+ * the illegal sequence.
+ * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
+ * Same as the obsolete "safe" behavior, but non-characters are also treated
+ * like illegal sequences.
+ *
+ * Note that a UBool is the same as an int8_t.
+ */
+UChar32
+utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+ // *pi is one after byte c.
+ int32_t i=*pi;
+ // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+ if(i==length || c>0xf4) {
+ // end of string, or not a lead byte
+ } else if(c>=0xf0) {
+ // Test for 4-byte sequences first because
+ // U8_NEXT() handles shorter valid sequences inline.
+ uint8_t t1=s[i], t2, t3;
+ c&=7;
+ if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+ ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
+ *pi=i;
+ return c;
+ }
+ }
+ } else if(c>=0xe0) {
+ c&=0xf;
+ if(strict!=-2) {
+ uint8_t t1=s[i], t2;
+ if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<12)|((t1&0x3f)<<6)|t2;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
+ *pi=i;
+ return c;
+ }
+ }
+ } else {
+ // strict=-2 -> lenient: allow surrogates
+ uint8_t t1=s[i]-0x80, t2;
+ if(t1<=0x3f && (c>0 || t1>=0x20) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ *pi=i+1;
+ return (c<<12)|(t1<<6)|t2;
+ }
+ }
+ } else if(c>=0xc2) {
+ uint8_t t1=s[i]-0x80;
+ if(t1<=0x3f) {
+ *pi=i+1;
+ return ((c-0xc0)<<6)|t1;
+ }
+ } // else 0x80<=c<0xc2 is not a lead byte
+
+ /* error handling */
+ c=errorValue(i-*pi, strict);
+ *pi=i;
+ return c;
+}
+
+} // namespace base_icu
diff --git a/base/third_party/icu/icu_utf.h b/base/third_party/icu/icu_utf.h
new file mode 100644
index 0000000..2ba8231
--- /dev/null
+++ b/base/third_party/icu/icu_utf.h
@@ -0,0 +1,442 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 1999-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+*/
+
+#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
+
+#include <stdint.h>
+
+namespace base_icu {
+
+// source/common/unicode/umachine.h
+
+/** The ICU boolean type @stable ICU 2.0 */
+typedef int8_t UBool;
+
+/**
+ * Define UChar32 as a type for single Unicode code points.
+ * UChar32 is a signed 32-bit integer (same as int32_t).
+ *
+ * The Unicode code point range is 0..0x10ffff.
+ * All other values (negative or >=0x110000) are illegal as Unicode code points.
+ * They may be used as sentinel values to indicate "done", "error"
+ * or similar non-code point conditions.
+ *
+ * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
+ * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
+ * or else to be uint32_t.
+ * That is, the definition of UChar32 was platform-dependent.
+ *
+ * @see U_SENTINEL
+ * @stable ICU 2.4
+ */
+typedef int32_t UChar32;
+
+/**
+ * This value is intended for sentinel values for APIs that
+ * (take or) return single code points (UChar32).
+ * It is outside of the Unicode code point range 0..0x10ffff.
+ *
+ * For example, a "done" or "error" value in a new API
+ * could be indicated with U_SENTINEL.
+ *
+ * ICU APIs designed before ICU 2.4 usually define service-specific "done"
+ * values, mostly 0xffff.
+ * Those may need to be distinguished from
+ * actual U+ffff text contents by calling functions like
+ * CharacterIterator::hasNext() or UnicodeString::length().
+ *
+ * @return -1
+ * @see UChar32
+ * @stable ICU 2.4
+ */
+#define CBU_SENTINEL (-1)
+
+// source/common/unicode/utf.h
+
+/**
+ * Is this code point a Unicode noncharacter?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_NONCHAR(c) \
+ ((c)>=0xfdd0 && \
+ ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff)
+
+/**
+ * Is c a Unicode code point value (0..U+10ffff)
+ * that can be assigned a character?
+ *
+ * Code points that are not characters include:
+ * - single surrogate code points (U+d800..U+dfff, 2048 code points)
+ * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
+ * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
+ * - the highest Unicode code point value is U+10ffff
+ *
+ * This means that all code points below U+d800 are character code points,
+ * and that boundary is tested first for performance.
+ *
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_UNICODE_CHAR(c) \
+ ((uint32_t)(c)<0xd800 || \
+ (0xdfff<(c) && (c)<=0x10ffff && !CBU_IS_UNICODE_NONCHAR(c)))
+
+/**
+ * Is this code point a surrogate (U+d800..U+dfff)?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
+
+/**
+ * Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 32-bit code point
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+// source/common/unicode/utf8.h
+
+/**
+ * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * Lead byte E0..EF bits 3..0 are used as byte index,
+ * first trail byte bits 7..5 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD3_AND_T1
+ * @internal
+ */
+#define CBU8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
+
+/**
+ * Internal 3-byte UTF-8 validity check.
+ * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
+ * @internal
+ */
+#define CBU8_IS_VALID_LEAD3_AND_T1(lead, t1) (CBU8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
+
+/**
+ * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * First trail byte bits 7..4 are used as byte index,
+ * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD4_AND_T1
+ * @internal
+ */
+#define CBU8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
+
+/**
+ * Internal 4-byte UTF-8 validity check.
+ * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
+ * @internal
+ */
+#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
+
+/**
+ * Function for handling "next code point" with error-checking.
+ *
+ * This is internal since it is not meant to be called directly by
+ * external clients;
+ * however it is U_STABLE (not U_INTERNAL) since it is called by
+ * public macros in this file and thus must remain stable,
+ * and should not be hidden when
+ * other internal
+ * functions are hidden (otherwise public macros would fail to compile).
+ * @internal
+ */
+UChar32
+utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu::UChar32 c, ::base_icu::UBool strict);
+
+/**
+ * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_SINGLE(c) (((c)&0x80)==0)
+
+/**
+ * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
+
+/**
+ * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
+
+/**
+ * How many code units (bytes) are used for the UTF-8 encoding
+ * of this Unicode code point?
+ * @param c 32-bit code point
+ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+ * @stable ICU 2.4
+ */
+#define CBU8_LENGTH(c) \
+ ((uint32_t)(c)<=0x7f ? 1 : \
+ ((uint32_t)(c)<=0x7ff ? 2 : \
+ ((uint32_t)(c)<=0xd7ff ? 3 : \
+ ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
+ ((uint32_t)(c)<=0xffff ? 3 : 4)\
+ ) \
+ ) \
+ ) \
+ )
+
+/**
+ * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+ * @return 4
+ * @stable ICU 2.4
+ */
+#define CBU8_MAX_LENGTH 4
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to a negative value.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see U8_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU8_NEXT(s, i, length, c) { \
+ (c)=(uint8_t)(s)[(i)++]; \
+ if(!CBU8_IS_SINGLE(c)) { \
+ uint8_t __t1, __t2; \
+ if( /* handle U+0800..U+FFFF inline */ \
+ (0xe0<=(c) && (c)<0xf0) && \
+ (((i)+1)<(length) || (length)<0) && \
+ CBU8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
+ (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
+ (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
+ (i)+=2; \
+ } else if( /* handle U+0080..U+07FF inline */ \
+ ((c)<0xe0 && (c)>=0xc2) && \
+ ((i)!=(length)) && \
+ (__t1=(s)[i]-0x80)<=0x3f) { \
+ (c)=(((c)&0x1f)<<6)|__t1; \
+ ++(i); \
+ } else { \
+ /* function call for "complicated" and error cases */ \
+ (c)=::base_icu::utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const uint8_t * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see U8_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU8_APPEND_UNSAFE(s, i, c) { \
+ if((uint32_t)(c)<=0x7f) { \
+ (s)[(i)++]=(uint8_t)(c); \
+ } else { \
+ if((uint32_t)(c)<=0x7ff) { \
+ (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
+ } else { \
+ if((uint32_t)(c)<=0xffff) { \
+ (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
+ } else { \
+ (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
+ (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+ } \
+}
+
+// source/common/unicode/utf16.h
+
+/**
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SINGLE(c) !CBU_IS_SURROGATE(c)
+
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+
+/**
+ * Is this code unit a surrogate (U+d800..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
+
+/**
+ * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+/**
+ * Helper constant for U16_GET_SUPPLEMENTARY.
+ * @internal
+ */
+#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+
+/**
+ * Get a supplementary code point value (U+10000..U+10ffff)
+ * from its lead and trail surrogates.
+ * The result is undefined if the input values are not
+ * lead and trail surrogates.
+ *
+ * @param lead lead surrogate (U+d800..U+dbff)
+ * @param trail trail surrogate (U+dc00..U+dfff)
+ * @return supplementary code point (U+10000..U+10ffff)
+ * @stable ICU 2.4
+ */
+#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
+ (((::base_icu::UChar32)(lead)<<10UL)+(::base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET)
+
+/**
+ * Get the lead surrogate (0xd800..0xdbff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return lead surrogate (U+d800..U+dbff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_LEAD(supplementary) (::base_icu::UChar)(((supplementary)>>10)+0xd7c0)
+
+/**
+ * Get the trail surrogate (0xdc00..0xdfff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return trail surrogate (U+dc00..U+dfff) for supplementary
+ * @stable ICU 2.4
+ */
+#define CBU16_TRAIL(supplementary) (::base_icu::UChar)(((supplementary)&0x3ff)|0xdc00)
+
+/**
+ * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
+ * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
+ * @param c 32-bit code point
+ * @return 1 or 2
+ * @stable ICU 2.4
+ */
+#define CBU16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+
+/**
+ * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+ * @return 2
+ * @stable ICU 2.4
+ */
+#define CBU16_MAX_LENGTH 2
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate or
+ * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
+ *
+ * @param s const UChar * string
+ * @param i string offset, must be i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define CBU16_NEXT(s, i, length, c) { \
+ (c)=(s)[(i)++]; \
+ if(CBU16_IS_LEAD(c)) { \
+ uint16_t __c2; \
+ if((i)!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 or 2 code units.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const UChar * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see U16_APPEND
+ * @stable ICU 2.4
+ */
+#define CBU16_APPEND_UNSAFE(s, i, c) { \
+ if((uint32_t)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16_t)(c); \
+ } else { \
+ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } \
+}
+
+} // namespace base_icu
+
+#endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
diff --git a/build/BUILD b/build/BUILD
new file mode 100644
index 0000000..f057fe5
--- /dev/null
+++ b/build/BUILD
@@ -0,0 +1,9 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "build_config",
+ hdrs = ["build_config.h"],
+ visibility = ["//visibility:public"],
+)
diff --git a/build/build_config.h b/build/build_config.h
new file mode 100644
index 0000000..0d87d80
--- /dev/null
+++ b/build/build_config.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This file adds defines about the platform we're currently building on.
+// Operating System:
+// OS_WIN / OS_MACOSX / OS_LINUX / OS_POSIX (MACOSX or LINUX) /
+// OS_NACL (NACL_SFI or NACL_NONSFI) / OS_NACL_SFI / OS_NACL_NONSFI
+// OS_CHROMEOS is set by the build system
+// Compiler:
+// COMPILER_MSVC / COMPILER_GCC
+// Processor:
+// ARCH_CPU_X86 / ARCH_CPU_X86_64 / ARCH_CPU_X86_FAMILY (X86 or X86_64)
+// ARCH_CPU_32_BITS / ARCH_CPU_64_BITS
+
+#ifndef BUILD_BUILD_CONFIG_H_
+#define BUILD_BUILD_CONFIG_H_
+
+// A set of macros to use for platform detection.
+#if defined(__native_client__)
+// __native_client__ must be first, so that other OS_ defines are not set.
+#define OS_NACL 1
+// OS_NACL comes in two sandboxing technology flavors, SFI or Non-SFI.
+// PNaCl toolchain defines __native_client_nonsfi__ macro in Non-SFI build
+// mode, while it does not in SFI build mode.
+#if defined(__native_client_nonsfi__)
+#define OS_NACL_NONSFI
+#else
+#define OS_NACL_SFI
+#endif
+#elif defined(ANDROID)
+#define OS_ANDROID 1
+#elif defined(__APPLE__)
+// only include TargetConditionals after testing ANDROID as some android builds
+// on mac don't have this header available and it's not needed unless the target
+// is really mac/ios.
+#include <TargetConditionals.h>
+#define OS_MACOSX 1
+#if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#define OS_IOS 1
+#endif // defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
+#elif defined(__linux__)
+#define OS_LINUX 1
+// include a system header to pull in features.h for glibc/uclibc macros.
+#include <unistd.h>
+#if defined(__GLIBC__) && !defined(__UCLIBC__)
+// we really are using glibc, not uClibc pretending to be glibc
+#define LIBC_GLIBC 1
+#endif
+#elif defined(_WIN32)
+#define OS_WIN 1
+#elif defined(__Fuchsia__)
+#define OS_FUCHSIA 1
+#elif defined(__FreeBSD__)
+#define OS_FREEBSD 1
+#elif defined(__NetBSD__)
+#define OS_NETBSD 1
+#elif defined(__OpenBSD__)
+#define OS_OPENBSD 1
+#elif defined(__sun)
+#define OS_SOLARIS 1
+#elif defined(__QNXNTO__)
+#define OS_QNX 1
+#elif defined(_AIX)
+#define OS_AIX 1
+#elif defined(__asmjs__)
+#define OS_ASMJS
+#else
+#error Please add support for your platform in build/build_config.h
+#endif
+// NOTE: Adding a new port? Please follow
+// https://chromium.googlesource.com/chromium/src/+/master/docs/new_port_policy.md
+
+// For access to standard BSD features, use OS_BSD instead of a
+// more specific macro.
+#if defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD)
+#define OS_BSD 1
+#endif
+
+// For access to standard POSIXish features, use OS_POSIX instead of a
+// more specific macro.
+#if defined(OS_AIX) || defined(OS_ANDROID) || defined(OS_ASMJS) || \
+ defined(OS_FREEBSD) || defined(OS_LINUX) || defined(OS_MACOSX) || \
+ defined(OS_NACL) || defined(OS_NETBSD) || defined(OS_OPENBSD) || \
+ defined(OS_QNX) || defined(OS_SOLARIS)
+#define OS_POSIX 1
+#endif
+
+// Compiler detection.
+#if defined(__GNUC__)
+#define COMPILER_GCC 1
+#elif defined(_MSC_VER)
+#define COMPILER_MSVC 1
+#else
+#error Please add support for your compiler in build/build_config.h
+#endif
+
+// Processor architecture detection. For more info on what's defined, see:
+// http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+// http://www.agner.org/optimize/calling_conventions.pdf
+// or with gcc, run: "echo | gcc -E -dM -"
+#if defined(_M_X64) || defined(__x86_64__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86_64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(_M_IX86) || defined(__i386__)
+#define ARCH_CPU_X86_FAMILY 1
+#define ARCH_CPU_X86 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__s390x__)
+#define ARCH_CPU_S390_FAMILY 1
+#define ARCH_CPU_S390X 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_BIG_ENDIAN 1
+#elif defined(__s390__)
+#define ARCH_CPU_S390_FAMILY 1
+#define ARCH_CPU_S390 1
+#define ARCH_CPU_31_BITS 1
+#define ARCH_CPU_BIG_ENDIAN 1
+#elif (defined(__PPC64__) || defined(__PPC__)) && defined(__BIG_ENDIAN__)
+#define ARCH_CPU_PPC64_FAMILY 1
+#define ARCH_CPU_PPC64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_BIG_ENDIAN 1
+#elif defined(__PPC64__)
+#define ARCH_CPU_PPC64_FAMILY 1
+#define ARCH_CPU_PPC64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__ARMEL__)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARMEL 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define ARCH_CPU_ARM_FAMILY 1
+#define ARCH_CPU_ARM64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__pnacl__) || defined(__asmjs__)
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#elif defined(__MIPSEL__)
+#if defined(__LP64__)
+#define ARCH_CPU_MIPS_FAMILY 1
+#define ARCH_CPU_MIPS64EL 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#else
+#define ARCH_CPU_MIPS_FAMILY 1
+#define ARCH_CPU_MIPSEL 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_LITTLE_ENDIAN 1
+#endif
+#elif defined(__MIPSEB__)
+#if defined(__LP64__)
+#define ARCH_CPU_MIPS_FAMILY 1
+#define ARCH_CPU_MIPS64 1
+#define ARCH_CPU_64_BITS 1
+#define ARCH_CPU_BIG_ENDIAN 1
+#else
+#define ARCH_CPU_MIPS_FAMILY 1
+#define ARCH_CPU_MIPS 1
+#define ARCH_CPU_32_BITS 1
+#define ARCH_CPU_BIG_ENDIAN 1
+#endif
+#else
+#error Please add support for your architecture in build/build_config.h
+#endif
+
+// Type detection for wchar_t.
+#if defined(OS_WIN)
+#define WCHAR_T_IS_UTF16
+#elif defined(OS_FUCHSIA)
+#define WCHAR_T_IS_UTF32
+#elif defined(OS_POSIX) && defined(COMPILER_GCC) && defined(__WCHAR_MAX__) && \
+ (__WCHAR_MAX__ == 0x7fffffff || __WCHAR_MAX__ == 0xffffffff)
+#define WCHAR_T_IS_UTF32
+#elif defined(OS_POSIX) && defined(COMPILER_GCC) && defined(__WCHAR_MAX__) && \
+ (__WCHAR_MAX__ == 0x7fff || __WCHAR_MAX__ == 0xffff)
+// On Posix, we'll detect short wchar_t, but projects aren't guaranteed to
+// compile in this mode (in particular, Chrome doesn't). This is intended for
+// other projects using base who manage their own dependencies and make sure
+// short wchar works for them.
+#define WCHAR_T_IS_UTF16
+#else
+#error Please add support for your compiler in build/build_config.h
+#endif
+
+#if defined(OS_ANDROID)
+// The compiler thinks std::string::const_iterator and "const char*" are
+// equivalent types.
+#define STD_STRING_ITERATOR_IS_CHAR_POINTER
+// The compiler thinks gurl_base::string16::const_iterator and "char16*" are
+// equivalent types.
+#define BASE_STRING16_ITERATOR_IS_CHAR16_POINTER
+#endif
+
+#endif // BUILD_BUILD_CONFIG_H_
diff --git a/copy.bara.sky b/copy.bara.sky
new file mode 100644
index 0000000..02a615d
--- /dev/null
+++ b/copy.bara.sky
@@ -0,0 +1,103 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Ideally, we would import things from Chromium Git. However, checking out
+# Chromium is *really* slow, so we use a local checkout instead.
+origin = folder.origin()
+
+# Import all URL-related files, plus some parts of //base, primarily those
+# related to string handling.
+import_list = glob(
+ include = [
+ "AUTHORS",
+ "LICENSE",
+ "base/compiler_specific.h",
+ "base/macros.h",
+ "base/debug/leak_annotations.h",
+ "base/no_destructor.h",
+ "base/optional.h",
+ "base/stl_util.h",
+ "base/template_util.h",
+ "base/strings/*.cc",
+ "base/strings/*.h",
+ "base/third_party/icu/**",
+ "build/build_config.h",
+ "url/*.cc",
+ "url/*.h",
+ "url/third_party/mozilla/**",
+ ],
+ exclude = [
+ "url/url_idna_icu_alternatives*",
+ ],
+)
+
+target_files = glob(
+ include = [
+ "base/**",
+ "build/**",
+ "url/**",
+ "AUTHORS",
+ "LICENSE",
+ ],
+ exclude = [
+ "**/BUILD",
+ ],
+)
+
+# These headers are pulled from //polyfills instead of being copied from
+# Chromium. Keep this list in sync with the hdrs in //polyfills/BUILD.
+polyfilled_headers = [
+ "base/base_export.h",
+ "base/component_export.h",
+ "base/debug/alias.h",
+ "base/export_template.h",
+ "base/logging.h",
+ "base/trace_event/memory_usage_estimator.h",
+]
+
+transformations = [
+    # Prefix the logging-related macros.
+    core.replace(
+        "${log}",
+        "GURL_${log}",
+        regex_groups = {"log": "\\bD?(LOG|CHECK|CHECK_(EQ|LT|GT|LE|GE|NE))\\b"},
+    ),
+    core.replace("DCHECK_IS_ON", "GURL_DCHECK_IS_ON"),
+    core.replace("NOTREACHED()", "GURL_NOTREACHED()"),
+
+    # Rename base:: to gurl_base::
+    core.replace("namespace base ", "namespace gurl_base "),
+    core.replace("base::", "gurl_base::"),
+
+    # Ugly hack. In Chromium, ICU is built with UChar = uint16_t. We can't
+    # really do that with the system ICU, so we have to work around this with
+    # a cast.
+    core.replace(
+        "src, src_len, output->data(),",
+        "(UChar*)src, src_len, (UChar*)output->data(),",
+    ),
+
+    # Use system ICU. The backslash is doubled so the regex receives \w+.
+    core.replace(
+        '"third_party/icu/source/common/unicode/${file}.h"',
+        "<unicode/${file}.h>",
+        regex_groups = {"file": "\\w+"},
+    ),
+]
+
+transformations += [
+ core.replace('#include "%s"' % header, '#include "polyfills/%s"' % header)
+ for header in polyfilled_headers
+]
+
+core.workflow(
+ name = "import",
+ origin = origin,
+ origin_files = import_list,
+ destination = folder.destination(),
+ destination_files = target_files,
+ authoring = authoring.pass_thru("GURL Maintainers <noreply@google.com>"),
+ mode = "SQUASH",
+ transformations = transformations,
+)
diff --git a/polyfills/BUILD b/polyfills/BUILD
new file mode 100644
index 0000000..e80d717
--- /dev/null
+++ b/polyfills/BUILD
@@ -0,0 +1,16 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "polyfills",
+ hdrs = [
+ "base/base_export.h",
+ "base/component_export.h",
+ "base/debug/alias.h",
+ "base/export_template.h",
+ "base/logging.h",
+ "base/trace_event/memory_usage_estimator.h",
+ ],
+ visibility = ["//visibility:public"],
+)
diff --git a/polyfills/base/base_export.h b/polyfills/base/base_export.h
new file mode 100644
index 0000000..209e910
--- /dev/null
+++ b/polyfills/base/base_export.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_BASE_EXPORT_H_
+#define POLYFILLS_BASE_BASE_EXPORT_H_
+
+#define BASE_EXPORT
+
+#endif /* POLYFILLS_BASE_BASE_EXPORT_H_ */
diff --git a/polyfills/base/component_export.h b/polyfills/base/component_export.h
new file mode 100644
index 0000000..3ce2ab1
--- /dev/null
+++ b/polyfills/base/component_export.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_COMPONENT_EXPORT_H_
+#define POLYFILLS_BASE_COMPONENT_EXPORT_H_
+
+#define COMPONENT_EXPORT(component)
+
+#endif /* POLYFILLS_BASE_COMPONENT_EXPORT_H_ */
diff --git a/polyfills/base/debug/alias.h b/polyfills/base/debug/alias.h
new file mode 100644
index 0000000..df9b5dc
--- /dev/null
+++ b/polyfills/base/debug/alias.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_DEBUG_ALIAS_H_
+#define POLYFILLS_BASE_DEBUG_ALIAS_H_
+
+#define DEBUG_ALIAS_FOR_CSTR(var_name, c_str, char_count)
+
+#endif // POLYFILLS_BASE_DEBUG_ALIAS_H_
diff --git a/polyfills/base/export_template.h b/polyfills/base/export_template.h
new file mode 100644
index 0000000..2b56e07
--- /dev/null
+++ b/polyfills/base/export_template.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_EXPORT_TEMPLATE_H_
+#define POLYFILLS_BASE_EXPORT_TEMPLATE_H_
+
+#define EXPORT_TEMPLATE_DEFINE(export)
+#define EXPORT_TEMPLATE_DECLARE(export)
+
+#endif /* POLYFILLS_BASE_EXPORT_TEMPLATE_H_ */
diff --git a/polyfills/base/logging.h b/polyfills/base/logging.h
new file mode 100644
index 0000000..def1745
--- /dev/null
+++ b/polyfills/base/logging.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_LOGGING_H_
+#define POLYFILLS_BASE_LOGGING_H_
+
+// The upstream header includes this, and some of the copied files actually rely
+// on this.
+#include <string.h>
+
+// Inert stand-in for Chromium's log streams: accepts and discards everything.
+class GurlFakeLogSink {
+ public:
+  template <typename T1> GurlFakeLogSink(T1) {}
+  template <typename T1, typename T2> GurlFakeLogSink(T1, T2) {}
+  template <typename T> GurlFakeLogSink& operator<<(const T&) { return *this; }
+};
+
+// CHECK variants evaluate their operands but never abort. DCHECK variants are
+// compiled but never evaluated, as upstream when DCHECK_IS_ON() is false.
+#define GURL_DCHECK_SINK(...) (true ? GurlFakeLogSink(true) : GurlFakeLogSink({__VA_ARGS__}))
+#define GURL_CHECK_LE(statement, statement2) GurlFakeLogSink({statement, statement2})
+#define GURL_CHECK_NE(statement, statement2) GurlFakeLogSink({statement, statement2})
+#define GURL_CHECK(statement) GurlFakeLogSink({statement})
+#define GURL_DCHECK_EQ(statement, statement2) GURL_DCHECK_SINK(statement, statement2)
+#define GURL_DCHECK_GT(statement, statement2) GURL_DCHECK_SINK(statement, statement2)
+#define GURL_DCHECK_IS_ON() false
+#define GURL_DCHECK_LE(statement, statement2) GURL_DCHECK_SINK(statement, statement2)
+#define GURL_DCHECK_LT(statement, statement2) GURL_DCHECK_SINK(statement, statement2)
+#define GURL_DCHECK(statement) GURL_DCHECK_SINK(statement)
+#define GURL_DLOG(severity) GurlFakeLogSink(true)
+#define GURL_LOG(severity) GurlFakeLogSink(true)
+#define GURL_NOTREACHED() GurlFakeLogSink(true)
+
+#endif /* POLYFILLS_BASE_LOGGING_H_ */
diff --git a/polyfills/base/trace_event/memory_usage_estimator.h b/polyfills/base/trace_event/memory_usage_estimator.h
new file mode 100644
index 0000000..6ef1bc3
--- /dev/null
+++ b/polyfills/base/trace_event/memory_usage_estimator.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_
+#define POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_
+#include <stddef.h>
+
+namespace gurl_base {
+namespace trace_event {
+// Polyfill stub: reports 0 bytes for every object, whatever its type.
+template <class T> size_t EstimateMemoryUsage(const T&) { return 0; }
+
+}  // namespace trace_event
+}  // namespace gurl_base
+
+#endif /* POLYFILLS_BASE_TRACE_EVENT_MEMORY_USAGE_ESTIMATOR_H_ */
diff --git a/test/BUILD b/test/BUILD
new file mode 100644
index 0000000..05c578b
--- /dev/null
+++ b/test/BUILD
@@ -0,0 +1,9 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_test(
+ name = "basic_test",
+ srcs = ["basic_test.cc"],
+ deps = ["//url"],
+)
diff --git a/test/basic_test.cc b/test/basic_test.cc
new file mode 100644
index 0000000..f60df57
--- /dev/null
+++ b/test/basic_test.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Basic smoke test to ensure that GURL works properly.
+
+#include "url/gurl.h"
+
+#include <cstdlib>
+#include <iostream>
+
+#define ASSERT_EQ(v1, v2) /* On mismatch: print both values, return 1. */ \
+  do {                                                                     \
+    if ((v1) != (v2)) {                                                    \
+      std::cerr << "Expected equality of\n " #v1 " (equal to " << (v1)     \
+                << ")\nand\n " #v2 " (equal to " << (v2) << ")" << std::endl; \
+      return 1;                                                            \
+    }                                                                      \
+  } while (0)
+
+int main() {
+  // Each component accessor must return the matching piece of the URL.
+  GURL url("https://example.org/test?foo=bar#section");
+  ASSERT_EQ(url.scheme(), "https");
+  ASSERT_EQ(url.host(), "example.org");
+  ASSERT_EQ(url.EffectiveIntPort(), 443);
+  ASSERT_EQ(url.path(), "/test");
+  ASSERT_EQ(url.query(), "foo=bar");
+  ASSERT_EQ(url.ref(), "section");
+
+  // Ensure ICU is functioning correctly.
+  GURL idn_url("https://\xe5\x85\x89.example/");
+  ASSERT_EQ(idn_url.spec(), "https://xn--54q.example/");
+  return 0;
+}
diff --git a/url/BUILD b/url/BUILD
new file mode 100644
index 0000000..ec01ee3
--- /dev/null
+++ b/url/BUILD
@@ -0,0 +1,51 @@
+# Copyright 2019 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+cc_library(
+ name = "url",
+ srcs = [
+ "gurl.cc",
+ "third_party/mozilla/url_parse.cc",
+ "url_canon.cc",
+ "url_canon_etc.cc",
+ "url_canon_filesystemurl.cc",
+ "url_canon_fileurl.cc",
+ "url_canon_host.cc",
+ "url_canon_internal.cc",
+ "url_canon_internal.h",
+ "url_canon_internal_file.h",
+ "url_canon_ip.cc",
+ "url_canon_mailtourl.cc",
+ "url_canon_path.cc",
+ "url_canon_pathurl.cc",
+ "url_canon_query.cc",
+ "url_canon_relative.cc",
+ "url_canon_stdstring.cc",
+ "url_canon_stdurl.cc",
+ "url_constants.cc",
+ "url_idna_icu.cc",
+ "url_parse_file.cc",
+ "url_parse_internal.h",
+ "url_util.cc",
+ "url_util_internal.h",
+ ],
+ hdrs = [
+ "gurl.h",
+ "third_party/mozilla/url_parse.h",
+ "url_canon.h",
+ "url_canon_icu.h",
+ "url_canon_ip.h",
+ "url_canon_stdstring.h",
+ "url_constants.h",
+ "url_file.h",
+ "url_util.h",
+ ],
+ linkopts = ["-licuuc"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//base",
+ "//base/strings",
+ "//polyfills",
+ ],
+)
diff --git a/url/gurl.cc b/url/gurl.cc
new file mode 100644
index 0000000..c8e424f
--- /dev/null
+++ b/url/gurl.cc
@@ -0,0 +1,532 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/gurl.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <ostream>
+#include <utility>
+
+#include "polyfills/base/logging.h"
+#include "base/no_destructor.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
+#include "polyfills/base/trace_event/memory_usage_estimator.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_util.h"
+
+GURL::GURL() : is_valid_(false) {
+}
+
+GURL::GURL(const GURL& other)
+ : spec_(other.spec_),
+ is_valid_(other.is_valid_),
+ parsed_(other.parsed_) {
+ if (other.inner_url_)
+ inner_url_.reset(new GURL(*other.inner_url_));
+ // Valid filesystem urls should always have an inner_url_.
+ GURL_DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+}
+
+GURL::GURL(GURL&& other) noexcept
+ : spec_(std::move(other.spec_)),
+ is_valid_(other.is_valid_),
+ parsed_(other.parsed_),
+ inner_url_(std::move(other.inner_url_)) {
+ other.is_valid_ = false;
+ other.parsed_ = url::Parsed();
+}
+
+GURL::GURL(gurl_base::StringPiece url_string) {
+ InitCanonical(url_string, true);
+}
+
+GURL::GURL(gurl_base::StringPiece16 url_string) {
+ InitCanonical(url_string, true);
+}
+
+GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
+ InitCanonical(gurl_base::StringPiece(url_string), false);
+}
+
+GURL::GURL(const char* canonical_spec,
+ size_t canonical_spec_len,
+ const url::Parsed& parsed,
+ bool is_valid)
+ : spec_(canonical_spec, canonical_spec_len),
+ is_valid_(is_valid),
+ parsed_(parsed) {
+ InitializeFromCanonicalSpec();
+}
+
+GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid)
+ : spec_(std::move(canonical_spec)), is_valid_(is_valid), parsed_(parsed) {
+ InitializeFromCanonicalSpec();
+}
+
+// Canonicalizes |input_spec| into |spec_| and |parsed_| and sets |is_valid_|.
+template <typename STR>
+void GURL::InitCanonical(gurl_base::BasicStringPiece<STR> input_spec,
+                         bool trim_path_end) {
+  url::StdStringCanonOutput output(&spec_);
+  is_valid_ = url::Canonicalize(
+      input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
+      nullptr, &output, &parsed_);
+  output.Complete();  // Must be done before using string.
+  if (is_valid_ && SchemeIsFileSystem()) {
+    inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
+                              *parsed_.inner_parsed(), true));
+  }
+  // Valid URLs always have non-empty specs.
+  GURL_DCHECK(!is_valid_ || !spec_.empty());
+}
+
+void GURL::InitializeFromCanonicalSpec() {
+ if (is_valid_ && SchemeIsFileSystem()) {
+ inner_url_.reset(
+ new GURL(spec_.data(), parsed_.Length(),
+ *parsed_.inner_parsed(), true));
+ }
+
+#ifndef NDEBUG
+ // For testing purposes, check that the parsed canonical URL is identical to
+ // what we would have produced. Skip the check for invalid URLs: they have no
+ // meaning and we can't always canonicalize them reproducibly.
+ if (is_valid_) {
+ GURL_DCHECK(!spec_.empty());
+ url::Component scheme;
+ // We can't do this check on the inner_url of a filesystem URL, as
+ // canonical_spec actually points to the start of the outer URL, so we'd
+ // end up with infinite recursion in this constructor.
+ if (!url::FindAndCompareScheme(spec_.data(), spec_.length(),
+ url::kFileSystemScheme, &scheme) ||
+ scheme.begin == parsed_.scheme.begin) {
+ // We need to retain trailing whitespace on path URLs, as the |parsed_|
+ // spec we originally received may legitimately contain trailing white-
+ // space on the path or components e.g. if the #ref has been
+ // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
+ GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
+
+ GURL_DCHECK(test_url.is_valid_ == is_valid_);
+ GURL_DCHECK(test_url.spec_ == spec_);
+
+ GURL_DCHECK(test_url.parsed_.scheme == parsed_.scheme);
+ GURL_DCHECK(test_url.parsed_.username == parsed_.username);
+ GURL_DCHECK(test_url.parsed_.password == parsed_.password);
+ GURL_DCHECK(test_url.parsed_.host == parsed_.host);
+ GURL_DCHECK(test_url.parsed_.port == parsed_.port);
+ GURL_DCHECK(test_url.parsed_.path == parsed_.path);
+ GURL_DCHECK(test_url.parsed_.query == parsed_.query);
+ GURL_DCHECK(test_url.parsed_.ref == parsed_.ref);
+ }
+ }
+#endif
+}
+
+GURL::~GURL() = default;
+
+GURL& GURL::operator=(const GURL& other) {
+ spec_ = other.spec_;
+ is_valid_ = other.is_valid_;
+ parsed_ = other.parsed_;
+
+ if (!other.inner_url_)
+ inner_url_.reset();
+ else if (inner_url_)
+ *inner_url_ = *other.inner_url_;
+ else
+ inner_url_.reset(new GURL(*other.inner_url_));
+
+ return *this;
+}
+
+GURL& GURL::operator=(GURL&& other) noexcept {
+ spec_ = std::move(other.spec_);
+ is_valid_ = other.is_valid_;
+ parsed_ = other.parsed_;
+ inner_url_ = std::move(other.inner_url_);
+
+ other.is_valid_ = false;
+ other.parsed_ = url::Parsed();
+ return *this;
+}
+
+const std::string& GURL::spec() const {
+ if (is_valid_ || spec_.empty())
+ return spec_;
+
+ GURL_DCHECK(false) << "Trying to get the spec of an invalid URL!";
+ return gurl_base::EmptyString();
+}
+
+bool GURL::operator<(const GURL& other) const {
+ return spec_ < other.spec_;
+}
+
+bool GURL::operator>(const GURL& other) const {
+ return spec_ > other.spec_;
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+GURL GURL::Resolve(gurl_base::StringPiece relative) const {
+ // Not allowed for invalid URLs.
+ if (!is_valid_)
+ return GURL();
+
+ GURL result;
+ url::StdStringCanonOutput output(&result.spec_);
+ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
+ parsed_, relative.data(),
+ static_cast<int>(relative.length()),
+ nullptr, &output, &result.parsed_)) {
+ // Error resolving, return an empty URL.
+ return GURL();
+ }
+
+ output.Complete();
+ result.is_valid_ = true;
+ if (result.SchemeIsFileSystem()) {
+ result.inner_url_.reset(
+ new GURL(result.spec_.data(), result.parsed_.Length(),
+ *result.parsed_.inner_parsed(), true));
+ }
+ return result;
+}
+
+// Note: code duplicated above (it's inconvenient to use a template here).
+GURL GURL::Resolve(gurl_base::StringPiece16 relative) const {
+ // Not allowed for invalid URLs.
+ if (!is_valid_)
+ return GURL();
+
+ GURL result;
+ url::StdStringCanonOutput output(&result.spec_);
+ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
+ parsed_, relative.data(),
+ static_cast<int>(relative.length()),
+ nullptr, &output, &result.parsed_)) {
+ // Error resolving, return an empty URL.
+ return GURL();
+ }
+
+ output.Complete();
+ result.is_valid_ = true;
+ if (result.SchemeIsFileSystem()) {
+ result.inner_url_.reset(
+ new GURL(result.spec_.data(), result.parsed_.Length(),
+ *result.parsed_.inner_parsed(), true));
+ }
+ return result;
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+// Returns an empty, invalid GURL if this URL is itself invalid.
+GURL GURL::ReplaceComponents(
+    const url::Replacements<char>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+  url::StdStringCanonOutput output(&result.spec_);
+  result.is_valid_ = url::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      nullptr, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    result.inner_url_.reset(new GURL(result.spec_.data(),
+                                     result.parsed_.Length(),
+                                     *result.parsed_.inner_parsed(), true));
+  }
+  return result;
+}
+
+// Note: code duplicated above (it's inconvenient to use a template here).
+// Returns an empty, invalid GURL if this URL is itself invalid.
+GURL GURL::ReplaceComponents(
+    const url::Replacements<gurl_base::char16>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+  url::StdStringCanonOutput output(&result.spec_);
+  result.is_valid_ = url::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      nullptr, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    result.inner_url_.reset(new GURL(result.spec_.data(),
+                                     result.parsed_.Length(),
+                                     *result.parsed_.inner_parsed(), true));
+  }
+  return result;
+}
+
+GURL GURL::GetOrigin() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL.
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ if (SchemeIsFileSystem())
+ return inner_url_->GetOrigin();
+
+ url::Replacements<char> replacements;
+ replacements.ClearUsername();
+ replacements.ClearPassword();
+ replacements.ClearPath();
+ replacements.ClearQuery();
+ replacements.ClearRef();
+
+ return ReplaceComponents(replacements);
+}
+
+GURL GURL::GetAsReferrer() const {
+ if (!SchemeIsValidForReferrer())
+ return GURL();
+
+ if (!has_ref() && !has_username() && !has_password())
+ return GURL(*this);
+
+ url::Replacements<char> replacements;
+ replacements.ClearRef();
+ replacements.ClearUsername();
+ replacements.ClearPassword();
+ return ReplaceComponents(replacements);
+}
+
+GURL GURL::GetWithEmptyPath() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL.
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ // We could optimize this since we know that the URL is canonical, and we are
+ // appending a canonical path, so avoiding re-parsing.
+ GURL other(*this);
+ if (parsed_.path.len == 0)
+ return other;
+
+ // Clear everything after the path.
+ other.parsed_.query.reset();
+ other.parsed_.ref.reset();
+
+ // Set the path, since the path is longer than one, we can just set the
+ // first character and resize.
+ other.spec_[other.parsed_.path.begin] = '/';
+ other.parsed_.path.len = 1;
+ other.spec_.resize(other.parsed_.path.begin + 1);
+ return other;
+}
+
+GURL GURL::GetWithoutFilename() const {
+ return Resolve(".");
+}
+
+bool GURL::IsStandard() const {
+ return url::IsStandard(spec_.data(), parsed_.scheme);
+}
+
+bool GURL::IsAboutBlank() const {
+ return IsAboutUrl(url::kAboutBlankPath);
+}
+
+bool GURL::IsAboutSrcdoc() const {
+ return IsAboutUrl(url::kAboutSrcdocPath);
+}
+
+bool GURL::SchemeIs(gurl_base::StringPiece lower_ascii_scheme) const {
+ GURL_DCHECK(gurl_base::IsStringASCII(lower_ascii_scheme));
+ GURL_DCHECK(gurl_base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme);
+
+ if (parsed_.scheme.len <= 0)
+ return lower_ascii_scheme.empty();
+ return scheme_piece() == lower_ascii_scheme;
+}
+
+bool GURL::SchemeIsHTTPOrHTTPS() const {
+ return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
+}
+
+bool GURL::SchemeIsValidForReferrer() const {
+ return is_valid_ && IsReferrerScheme(spec_.data(), parsed_.scheme);
+}
+
+bool GURL::SchemeIsWSOrWSS() const {
+ return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
+}
+
+bool GURL::SchemeIsCryptographic() const {
+ if (parsed_.scheme.len <= 0)
+ return false;
+ return SchemeIsCryptographic(scheme_piece());
+}
+
+bool GURL::SchemeIsCryptographic(gurl_base::StringPiece lower_ascii_scheme) {
+ GURL_DCHECK(gurl_base::IsStringASCII(lower_ascii_scheme));
+ GURL_DCHECK(gurl_base::ToLowerASCII(lower_ascii_scheme) == lower_ascii_scheme);
+
+ return lower_ascii_scheme == url::kHttpsScheme ||
+ lower_ascii_scheme == url::kWssScheme;
+}
+
+int GURL::IntPort() const {
+ if (parsed_.port.is_nonempty())
+ return url::ParsePort(spec_.data(), parsed_.port);
+ return url::PORT_UNSPECIFIED;
+}
+
+int GURL::EffectiveIntPort() const {
+ int int_port = IntPort();
+ if (int_port == url::PORT_UNSPECIFIED && IsStandard())
+ return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
+ parsed_.scheme.len);
+ return int_port;
+}
+
+std::string GURL::ExtractFileName() const {
+ url::Component file_component;
+ url::ExtractFileName(spec_.data(), parsed_.path, &file_component);
+ return ComponentString(file_component);
+}
+
+std::string GURL::PathForRequest() const {
+ GURL_DCHECK(parsed_.path.len > 0)
+ << "Canonical path for requests should be non-empty";
+ if (parsed_.ref.len >= 0) {
+ // Clip off the reference when it exists. The reference starts after the
+ // #-sign, so we have to subtract one to also remove it.
+ return std::string(spec_, parsed_.path.begin,
+ parsed_.ref.begin - parsed_.path.begin - 1);
+ }
+ // Compute the actual path length, rather than depending on the spec's
+ // terminator. If we're an inner_url, our spec continues on into our outer
+ // URL's path/query/ref.
+ int path_len = parsed_.path.len;
+ if (parsed_.query.is_valid())
+ path_len = parsed_.query.end() - parsed_.path.begin;
+
+ return std::string(spec_, parsed_.path.begin, path_len);
+}
+
+std::string GURL::HostNoBrackets() const {
+ return HostNoBracketsPiece().as_string();
+}
+
+gurl_base::StringPiece GURL::HostNoBracketsPiece() const {
+ // If host looks like an IPv6 literal, strip the square brackets.
+ url::Component h(parsed_.host);
+ if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
+ h.begin++;
+ h.len -= 2;
+ }
+ return ComponentStringPiece(h);
+}
+
+std::string GURL::GetContent() const {
+ if (!is_valid_)
+ return std::string();
+ std::string content = ComponentString(parsed_.GetContent());
+ if (!SchemeIs(url::kJavaScriptScheme) && parsed_.ref.len >= 0)
+ content.erase(content.size() - parsed_.ref.len - 1);
+ return content;
+}
+
+bool GURL::HostIsIPAddress() const {
+ return is_valid_ && url::HostIsIPAddress(host_piece());
+}
+
+const GURL& GURL::EmptyGURL() {
+ static gurl_base::NoDestructor<GURL> empty_gurl;
+ return *empty_gurl;
+}
+
+bool GURL::DomainIs(gurl_base::StringPiece canonical_domain) const {
+ if (!is_valid_)
+ return false;
+
+ // FileSystem URLs have empty host_piece, so check this first.
+ if (inner_url_ && SchemeIsFileSystem())
+ return inner_url_->DomainIs(canonical_domain);
+ return url::DomainIs(host_piece(), canonical_domain);
+}
+
+bool GURL::EqualsIgnoringRef(const GURL& other) const {
+ int ref_position = parsed_.CountCharactersBefore(url::Parsed::REF, true);
+ int ref_position_other =
+ other.parsed_.CountCharactersBefore(url::Parsed::REF, true);
+ return gurl_base::StringPiece(spec_).substr(0, ref_position) ==
+ gurl_base::StringPiece(other.spec_).substr(0, ref_position_other);
+}
+
+void GURL::Swap(GURL* other) {
+ spec_.swap(other->spec_);
+ std::swap(is_valid_, other->is_valid_);
+ std::swap(parsed_, other->parsed_);
+ inner_url_.swap(other->inner_url_);
+}
+
+size_t GURL::EstimateMemoryUsage() const {
+ return gurl_base::trace_event::EstimateMemoryUsage(spec_) +
+ gurl_base::trace_event::EstimateMemoryUsage(inner_url_) +
+ (parsed_.inner_parsed() ? sizeof(url::Parsed) : 0);
+}
+
+bool GURL::IsAboutUrl(gurl_base::StringPiece allowed_path) const {
+ if (!SchemeIs(url::kAboutScheme))
+ return false;
+
+ if (has_host() || has_username() || has_password() || has_port())
+ return false;
+
+ if (!path_piece().starts_with(allowed_path))
+ return false;
+
+ if (path_piece().size() == allowed_path.size()) {
+ GURL_DCHECK_EQ(path_piece(), allowed_path);
+ return true;
+ }
+
+ if ((path_piece().size() == allowed_path.size() + 1) &&
+ path_piece().back() == '/') {
+ GURL_DCHECK_EQ(path_piece(), allowed_path.as_string() + '/');
+ return true;
+ }
+
+ return false;
+}
+
+std::ostream& operator<<(std::ostream& out, const GURL& url) {
+ return out << url.possibly_invalid_spec();
+}
+
+bool operator==(const GURL& x, const GURL& y) {
+ return x.possibly_invalid_spec() == y.possibly_invalid_spec();
+}
+
+bool operator!=(const GURL& x, const GURL& y) {
+ return !(x == y);
+}
+
+bool operator==(const GURL& x, const gurl_base::StringPiece& spec) {
+ GURL_DCHECK_EQ(GURL(spec).possibly_invalid_spec(), spec);
+ return x.possibly_invalid_spec() == spec;
+}
+
+bool operator==(const gurl_base::StringPiece& spec, const GURL& x) {
+ return x == spec;
+}
+
+bool operator!=(const GURL& x, const gurl_base::StringPiece& spec) {
+ return !(x == spec);
+}
+
+bool operator!=(const gurl_base::StringPiece& spec, const GURL& x) {
+ return !(x == spec);
+}
diff --git a/url/gurl.h b/url/gurl.h
new file mode 100644
index 0000000..8c026f7
--- /dev/null
+++ b/url/gurl.h
@@ -0,0 +1,507 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_GURL_H_
+#define URL_GURL_H_
+
+#include <stddef.h>
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/debug/alias.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+
+// Represents a URL. GURL is Google's URL parsing library.
+//
+// A parsed canonicalized URL is guaranteed to be UTF-8. Any non-ASCII input
+// characters are UTF-8 encoded and % escaped to ASCII.
+//
+// The string representation of a URL is called the spec(). Getting the
+// spec will assert if the URL is invalid to help protect against malicious
+// URLs. If you want the "best effort" canonicalization of an invalid URL, you
+// can use possibly_invalid_spec(). Test validity with is_valid(). Data and
+// javascript URLs use GetContent() to extract the data.
+//
+// This class has existence checkers and getters for the various components of
+// a URL. Existence is different than being nonempty. "http://www.google.com/?"
+// has a query that just happens to be empty, and has_query() will return true
+// while the query getters will return the empty string.
+//
+// Prefer not to modify a URL using string operations (though sometimes this is
+// unavoidable). Instead, use ReplaceComponents which can replace or delete
+// multiple parts of a URL in one step, doesn't re-canonicalize unchanged
+// sections, and avoids some screw-ups. An example is creating a URL with a
+// path that contains a literal '#'. Using string concatenation will generate a
+// URL with a truncated path and a reference fragment, while ReplaceComponents
+// will know to escape this and produce the desired result.
+class COMPONENT_EXPORT(URL) GURL {
+ public:
+ typedef url::StringPieceReplacements<std::string> Replacements;
+ typedef url::StringPieceReplacements<gurl_base::string16> ReplacementsW;
+
+ // Creates an empty, invalid URL.
+ GURL();
+
+ // Copy construction is relatively inexpensive, with most of the time going
+ // to reallocating the string. It does not re-parse.
+ GURL(const GURL& other);
+ GURL(GURL&& other) noexcept;
+
+ // The strings to this constructor should be UTF-8 / UTF-16.
+ explicit GURL(gurl_base::StringPiece url_string);
+ explicit GURL(gurl_base::StringPiece16 url_string);
+
+ // Constructor for URLs that have already been parsed and canonicalized. This
+ // is used for conversions from KURL, for example. The caller must supply all
+ // information associated with the URL, which must be correct and consistent.
+ GURL(const char* canonical_spec,
+ size_t canonical_spec_len,
+ const url::Parsed& parsed,
+ bool is_valid);
+ // Notice that we take the canonical_spec by value so that we can convert
+ // from WebURL without copying the string. When we call this constructor
+ // we pass in a temporary std::string, which lets the compiler skip the
+ // copy and just move the std::string into the function argument. In the
+ // implementation, we use std::move to move the data into the GURL itself,
+ // which means we end up with zero copies.
+ GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid);
+
+ ~GURL();
+
+ GURL& operator=(const GURL& other);
+ GURL& operator=(GURL&& other) noexcept;
+
+ // Returns true when this object represents a valid parsed URL. When not
+ // valid, other functions will still succeed, but you will not get canonical
+ // data out in the format you may be expecting. Instead, we keep something
+ // "reasonable looking" so that the user can see how it's busted if
+ // displayed to them.
+ bool is_valid() const {
+ return is_valid_;
+ }
+
+ // Returns true if the URL is zero-length. Note that empty URLs are also
+ // invalid, and is_valid() will return false for them. This is provided
+ // because some users may want to treat the empty case differently.
+ bool is_empty() const {
+ return spec_.empty();
+ }
+
+ // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8,
+ // if the URL is valid. If the URL is not valid, this will assert and return
+ // the empty string (for safety in release builds, to keep them from being
+ // misused which might be a security problem).
+ //
+ // The URL will be ASCII (non-ASCII characters will be %-escaped UTF-8).
+ //
+ // The exception is for is_empty() URLs (which are !is_valid()) but this will
+ // return the empty string without asserting.
+ //
+ // Use possibly_invalid_spec() below to get the unusable spec of an invalid
+ // URL. This separation is designed to prevent security problems that could
+ // result from the mistaken use of an invalid URL.
+ const std::string& spec() const;
+
+ // Returns the potentially invalid spec for the URL. This spec MUST NOT be
+ // modified or sent over the network. It is designed to be displayed in error
+ // messages to the user, as the appearance of the spec may explain the error.
+ // If the spec is valid, the valid spec will be returned.
+ //
+ // The returned string is guaranteed to be valid UTF-8.
+ const std::string& possibly_invalid_spec() const {
+ return spec_;
+ }
+
+ // Getter for the raw parsed structure. This allows callers to locate parts
+ // of the URL within the spec themselves. Most callers should consider using
+ // the individual component getters below.
+ //
+ // The returned parsed structure will reference into the raw spec, which may
+ // or may not be valid. If you are using this to index into the spec, BE
+ // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
+ // don't do anything "important" with invalid specs.
+ const url::Parsed& parsed_for_possibly_invalid_spec() const {
+ return parsed_;
+ }
+
+ // Lets GURL be used as a key in STL (for example, a std::set or std::map).
+ bool operator<(const GURL& other) const;
+ bool operator>(const GURL& other) const;
+
+ // Resolves a URL that's possibly relative to this object's URL, and returns
+ // it. Absolute URLs are also handled according to the rules of URLs on web
+ // pages.
+ //
+ // It may be impossible to resolve the URLs properly. If the input is not
+ // "standard" (IsStandard() == false) and the input looks relative, we can't
+ // resolve it. In these cases, the result will be an empty, invalid GURL.
+ //
+ // The result may also be a nonempty, invalid URL if the input has some kind
+ // of encoding error. In these cases, we will try to construct a "good" URL
+ // that may have meaning to the user, but it will be marked invalid.
+ //
+ // It is an error to resolve a URL relative to an invalid URL. The result
+ // will be the empty URL.
+ GURL Resolve(gurl_base::StringPiece relative) const;
+ GURL Resolve(gurl_base::StringPiece16 relative) const;
+
+ // Creates a new GURL by replacing the current URL's components with the
+ // supplied versions. See the Replacements class in url_canon.h for more.
+ //
+ // These are not particularly quick, so avoid doing mutations when possible.
+ // Prefer the 8-bit version when possible.
+ //
+ // It is an error to replace components of an invalid URL. The result will
+ // be the empty URL.
+ //
+ // Note that we use the more general url::Replacements type to give
+ // callers extra flexibility rather than our override.
+ GURL ReplaceComponents(const url::Replacements<char>& replacements) const;
+ GURL ReplaceComponents(
+ const url::Replacements<gurl_base::char16>& replacements) const;
+
+ // A helper function that is equivalent to replacing the path with a slash
+ // and clearing out everything after that. We sometimes need to know just the
+ // scheme and the authority. If this URL is not a standard URL (it doesn't
+ // have the regular authority and path sections), then the result will be
+ // an empty, invalid GURL. Note that this *does* work for file: URLs, which
+ // some callers may want to filter out before calling this.
+ //
+ // It is an error to get an empty path on an invalid URL. The result
+ // will be the empty URL.
+ GURL GetWithEmptyPath() const;
+
+ // A helper function to return a GURL without the filename, query values, and
+ // fragment. For example,
+ // GURL("https://www.foo.com/index.html?q=test").GetWithoutFilename().spec()
+ // will return "https://www.foo.com/".
+ // GURL("https://www.foo.com/bar/").GetWithoutFilename().spec()
+ // will return "https://www.foo.com/bar/". If the GURL is invalid or missing a
+ // scheme, authority or path, it will return an empty, invalid GURL.
+ GURL GetWithoutFilename() const;
+
+ // A helper function to return a GURL containing just the scheme, host,
+ // and port from a URL. Equivalent to clearing any username and password,
+ // replacing the path with a slash, and clearing everything after that. If
+ // this URL is not a standard URL, then the result will be an empty,
+ // invalid GURL. If the URL has neither username nor password, this
+ // degenerates to GetWithEmptyPath().
+ //
+ // It is an error to get the origin of an invalid URL. The result
+ // will be the empty URL.
+ GURL GetOrigin() const;
+
+ // A helper function to return a GURL stripped from the elements that are not
+ // supposed to be sent as HTTP referrer: username, password and ref fragment.
+ // For invalid URLs or URLs that have no valid referrers, an empty URL will be
+ // returned.
+ GURL GetAsReferrer() const;
+
+ // Returns true if the scheme for the current URL is a known "standard-format"
+ // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic
+ // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes
+ // file: and filesystem:, which some callers may want to filter out explicitly
+ // by calling SchemeIsFile[System].
+ bool IsStandard() const;
+
+ // Returns true when the url is of the form about:blank, about:blank?foo or
+ // about:blank/#foo.
+ bool IsAboutBlank() const;
+
+ // Returns true when the url is of the form about:srcdoc, about:srcdoc?foo or
+ // about:srcdoc/#foo.
+ bool IsAboutSrcdoc() const;
+
+ // Returns true if the given parameter (should be lower-case ASCII to match
+ // the canonicalized scheme) is the scheme for this URL. Do not include a
+ // colon.
+ bool SchemeIs(gurl_base::StringPiece lower_ascii_scheme) const;
+
+ // Returns true if the scheme is "http" or "https".
+ bool SchemeIsHTTPOrHTTPS() const;
+
+ // Returns true if the scheme is valid for use as a referrer.
+ bool SchemeIsValidForReferrer() const;
+
+ // Returns true if the scheme is "ws" or "wss".
+ bool SchemeIsWSOrWSS() const;
+
+ // We often need to know if this is a file URL. File URLs are "standard", but
+ // are often treated separately by some programs.
+ bool SchemeIsFile() const {
+ return SchemeIs(url::kFileScheme);
+ }
+
+ // FileSystem URLs need to be treated differently in some cases.
+ bool SchemeIsFileSystem() const {
+ return SchemeIs(url::kFileSystemScheme);
+ }
+
+ // Returns true if the scheme indicates a network connection that uses TLS or
+ // some other cryptographic protocol (e.g. QUIC) for security.
+ //
+ // This function is not a complete test of whether or not an origin's code
+ // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for
+ // higher-level and more complete semantics. See that function's documentation
+ // for more detail.
+ bool SchemeIsCryptographic() const;
+
+ // As above, but static. Parameter should be lower-case ASCII.
+ static bool SchemeIsCryptographic(gurl_base::StringPiece lower_ascii_scheme);
+
+ // Returns true if the scheme is "blob".
+ bool SchemeIsBlob() const {
+ return SchemeIs(url::kBlobScheme);
+ }
+
+ // For most URLs, the "content" is everything after the scheme (skipping the
+ // scheme delimiting colon) and before the fragment (skipping the fragment
+ // delimiting octothorpe). For javascript URLs the "content" also includes the
+ // fragment delimiter and fragment.
+ //
+ // It is an error to get the content of an invalid URL: the result will be an
+ // empty string.
+ std::string GetContent() const;
+
+ // Returns true if the hostname is an IP address. Note: this function isn't
+ // as cheap as a simple getter because it re-parses the hostname to verify.
+ bool HostIsIPAddress() const;
+
+ // Not including the colon. If you are comparing schemes, prefer SchemeIs.
+ bool has_scheme() const {
+ return parsed_.scheme.len >= 0;
+ }
+ std::string scheme() const {
+ return ComponentString(parsed_.scheme);
+ }
+ gurl_base::StringPiece scheme_piece() const {
+ return ComponentStringPiece(parsed_.scheme);
+ }
+
+ bool has_username() const {
+ return parsed_.username.len >= 0;
+ }
+ std::string username() const {
+ return ComponentString(parsed_.username);
+ }
+ gurl_base::StringPiece username_piece() const {
+ return ComponentStringPiece(parsed_.username);
+ }
+
+ bool has_password() const {
+ return parsed_.password.len >= 0;
+ }
+ std::string password() const {
+ return ComponentString(parsed_.password);
+ }
+ gurl_base::StringPiece password_piece() const {
+ return ComponentStringPiece(parsed_.password);
+ }
+
+ // The host may be a hostname, an IPv4 address, or an IPv6 literal surrounded
+ // by square brackets, like "[2001:db8::1]". To exclude these brackets, use
+ // HostNoBrackets() below.
+ bool has_host() const {
+ // Note that hosts are special, absence of host means length 0.
+ return parsed_.host.len > 0;
+ }
+ std::string host() const {
+ return ComponentString(parsed_.host);
+ }
+ gurl_base::StringPiece host_piece() const {
+ return ComponentStringPiece(parsed_.host);
+ }
+
+ // The port if one is explicitly specified. Most callers will want IntPort()
+ // or EffectiveIntPort() instead of these. The getters will not include the
+ // ':'.
+ bool has_port() const {
+ return parsed_.port.len >= 0;
+ }
+ std::string port() const {
+ return ComponentString(parsed_.port);
+ }
+ gurl_base::StringPiece port_piece() const {
+ return ComponentStringPiece(parsed_.port);
+ }
+
+ // Including first slash following host, up to the query. The URL
+ // "http://www.google.com/" has a path of "/".
+ bool has_path() const {
+ return parsed_.path.len >= 0;
+ }
+ std::string path() const {
+ return ComponentString(parsed_.path);
+ }
+ gurl_base::StringPiece path_piece() const {
+ return ComponentStringPiece(parsed_.path);
+ }
+
+ // Stuff following '?' up to the ref. The getters will not include the '?'.
+ bool has_query() const {
+ return parsed_.query.len >= 0;
+ }
+ std::string query() const {
+ return ComponentString(parsed_.query);
+ }
+ gurl_base::StringPiece query_piece() const {
+ return ComponentStringPiece(parsed_.query);
+ }
+
+ // Stuff following '#' to the end of the string. This will be %-escaped UTF-8.
+ // The getters will not include the '#'.
+ bool has_ref() const {
+ return parsed_.ref.len >= 0;
+ }
+ std::string ref() const {
+ return ComponentString(parsed_.ref);
+ }
+ gurl_base::StringPiece ref_piece() const {
+ return ComponentStringPiece(parsed_.ref);
+ }
+
+ // Returns a parsed version of the port. Can also be any of the special
+ // values defined in Parsed for ExtractPort.
+ int IntPort() const;
+
+ // Returns the port number of the URL, or the default port number.
+ // If the scheme has no concept of port (or unknown default) returns
+ // PORT_UNSPECIFIED.
+ int EffectiveIntPort() const;
+
+ // Extracts the filename portion of the path and returns it. The filename
+ // is everything after the last slash in the path. This may be empty.
+ std::string ExtractFileName() const;
+
+ // Returns the path that should be sent to the server. This is the path,
+ // parameter, and query portions of the URL. It is guaranteed to be ASCII.
+ std::string PathForRequest() const;
+
+ // Returns the host, excluding the square brackets surrounding IPv6 address
+ // literals. This can be useful for passing to getaddrinfo().
+ std::string HostNoBrackets() const;
+
+ // Returns the same characters as HostNoBrackets(), avoiding a copy.
+ gurl_base::StringPiece HostNoBracketsPiece() const;
+
+ // Returns true if this URL's host matches or is in the same domain as
+ // the given input string. For example, if the hostname of the URL is
+ // "www.google.com", this will return true for "com", "google.com", and
+ // "www.google.com".
+ //
+ // The input domain should match host canonicalization rules. i.e. the input
+ // should be lowercase except for escape chars.
+ //
+ // This call is more efficient than getting the host and checking whether the
+ // host has the specific domain or not because no copies or object
+ // constructions are done.
+ bool DomainIs(gurl_base::StringPiece canonical_domain) const;
+
+ // Checks whether or not two URLs differ only in the ref (the part after
+ // the # character).
+ bool EqualsIgnoringRef(const GURL& other) const;
+
+ // Swaps the contents of this GURL object with |other|, without doing
+ // any memory allocations.
+ void Swap(GURL* other);
+
+ // Returns a reference to a singleton empty GURL. This object is for callers
+ // who return references but don't have anything to return in some cases.
+ // If you just want an empty URL for normal use, prefer GURL(). This function
+ // may be called from any thread.
+ static const GURL& EmptyGURL();
+
+ // Returns the inner URL of a nested URL (currently only non-null for
+ // filesystem URLs).
+ //
+ // TODO(mmenke): inner_url().spec() currently returns the same value as
+ // calling spec() on the GURL itself. This should be fixed.
+ // See https://crbug.com/619596
+ const GURL* inner_url() const {
+ return inner_url_.get();
+ }
+
+ // Estimates dynamic memory usage.
+ // See base/trace_event/memory_usage_estimator.h for more info.
+ size_t EstimateMemoryUsage() const;
+
+ private:
+ // Variant of the string parsing constructor that allows the caller to elect
+ // to retain trailing whitespace, if any, on the passed URL spec, but only if
+ // the scheme is one that allows trailing whitespace. The primary use-case is
+ // for data: URLs. In most cases, you want to use the single parameter
+ // constructor above.
+ enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
+ GURL(const std::string& url_string, RetainWhiteSpaceSelector);
+
+ template<typename STR>
+ void InitCanonical(gurl_base::BasicStringPiece<STR> input_spec,
+ bool trim_path_end);
+
+ void InitializeFromCanonicalSpec();
+
+ // Helper used by IsAboutBlank and IsAboutSrcdoc.
+ bool IsAboutUrl(gurl_base::StringPiece allowed_path) const;
+
+ // Returns the substring of the input identified by the given component.
+ std::string ComponentString(const url::Component& comp) const {
+ if (comp.len <= 0)
+ return std::string();
+ return std::string(spec_, comp.begin, comp.len);
+ }
+ gurl_base::StringPiece ComponentStringPiece(const url::Component& comp) const {
+ if (comp.len <= 0)
+ return gurl_base::StringPiece();
+ return gurl_base::StringPiece(&spec_[comp.begin], comp.len);
+ }
+
+ // The actual text of the URL, in canonical ASCII form.
+ std::string spec_;
+
+ // Set when the given URL is valid. Otherwise, we may still have a spec and
+ // components, but they may not identify valid resources (for example, an
+ // invalid port number, invalid characters in the scheme, etc.).
+ bool is_valid_;
+
+ // Identified components of the canonical spec.
+ url::Parsed parsed_;
+
+ // Used for nested schemes [currently only filesystem:].
+ std::unique_ptr<GURL> inner_url_;
+};
+
+// Stream operator so GURL can be used in assertion statements.
+COMPONENT_EXPORT(URL)
+std::ostream& operator<<(std::ostream& out, const GURL& url);
+
+COMPONENT_EXPORT(URL) bool operator==(const GURL& x, const GURL& y);
+COMPONENT_EXPORT(URL) bool operator!=(const GURL& x, const GURL& y);
+
+// Equality operator for comparing raw spec_. This should be used in place of
+// url == GURL(spec) where |spec| is known (i.e. constants). This is to prevent
+// needlessly re-parsing |spec| into a temporary GURL.
+COMPONENT_EXPORT(URL)
+bool operator==(const GURL& x, const gurl_base::StringPiece& spec);
+COMPONENT_EXPORT(URL)
+bool operator==(const gurl_base::StringPiece& spec, const GURL& x);
+COMPONENT_EXPORT(URL)
+bool operator!=(const GURL& x, const gurl_base::StringPiece& spec);
+COMPONENT_EXPORT(URL)
+bool operator!=(const gurl_base::StringPiece& spec, const GURL& x);
+
+// DEBUG_ALIAS_FOR_GURL(var_name, url) copies the possibly-invalid spec of
+// |url| into a new stack-allocated variable named |<var_name>| (forwarding
+// 128 to DEBUG_ALIAS_FOR_CSTR), which helps preserve it in crash dumps.
+#define DEBUG_ALIAS_FOR_GURL(var_name, url) \
+ DEBUG_ALIAS_FOR_CSTR(var_name, (url).possibly_invalid_spec().c_str(), 128)
+
+#endif // URL_GURL_H_
diff --git a/url/gurl_fuzzer.cc b/url/gurl_fuzzer.cc
new file mode 100644
index 0000000..71f3540
--- /dev/null
+++ b/url/gurl_fuzzer.cc
@@ -0,0 +1,83 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <string>
+
+#include "base/at_exit.h"
+#include "base/i18n/icu_util.h"
+#include "url/gurl.h"
+
+// Process-wide fixture. Its constructor initializes ICU and CHECKs that the
+// initialization succeeded.
+struct TestCase {
+  TestCase() { GURL_CHECK(gurl_base::i18n::InitializeICU()); }
+
+  // used by ICU integration.
+  gurl_base::AtExitManager at_exit_manager;
+};
+
+// Built during static initialization; nothing in this file deletes it.
+TestCase* test_case = new TestCase();
+
+// Entry point for LibFuzzer.
+//
+// Exercises GURL with the raw input in three ways:
+//   1. Parses the whole input as an 8-bit URL.
+//   2. If |size| is even, parses the whole input as a 16-bit URL.
+//   3. If |size| exceeds sizeof(size_t), uses the leading sizeof(size_t) bytes
+//      as a length prefix that splits the rest into a relative reference and
+//      a (non-empty) base URL, then resolves the reference against the base:
+//      always as 8-bit text, and as 16-bit text when its size is even.
+//
+// |data| is a byte pointer, which guarantees nothing about alignment for wider
+// types, so size_t and char16 values are copied out with memcpy rather than
+// read through a reinterpret_cast'ed pointer.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  if (size < 1)
+    return 0;
+
+  gurl_base::StringPiece string_piece_input(reinterpret_cast<const char*>(data),
+                                            size);
+  GURL url_from_string_piece(string_piece_input);
+
+  // Test for StringPiece16 if size is even.
+  if (size % 2 == 0) {
+    gurl_base::string16 input16;
+    input16.resize(size / 2);
+    memcpy(&input16[0], data, size);
+    gurl_base::StringPiece16 string_piece_input16(input16);
+
+    GURL url_from_string_piece16(string_piece_input16);
+  }
+
+  // Resolve relative url tests.
+  const size_t size_t_bytes = sizeof(size_t);
+  if (size < size_t_bytes + 1) {
+    return 0;
+  }
+  size_t length_prefix = 0;
+  memcpy(&length_prefix, data, size_t_bytes);
+  // The modulus keeps at least one byte for the base URL.
+  const size_t relative_size = length_prefix % (size - size_t_bytes);
+  std::string relative_string(
+      reinterpret_cast<const char*>(data + size_t_bytes), relative_size);
+  gurl_base::StringPiece string_piece_part_input(
+      reinterpret_cast<const char*>(data + size_t_bytes + relative_size),
+      size - relative_size - size_t_bytes);
+  GURL url_from_string_piece_part(string_piece_part_input);
+  url_from_string_piece_part.Resolve(relative_string);
+
+  if (relative_size % 2 == 0) {
+    gurl_base::string16 relative_string16;
+    relative_string16.resize(relative_size / 2);
+    if (!relative_string16.empty())
+      memcpy(&relative_string16[0], data + size_t_bytes, relative_size);
+    url_from_string_piece_part.Resolve(relative_string16);
+  }
+  return 0;
+}
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
new file mode 100644
index 0000000..0375eae
--- /dev/null
+++ b/url/gurl_unittest.cc
@@ -0,0 +1,963 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include "base/stl_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+namespace {
+
+// Invokes |func| on |replacements| with |str|; does nothing for a null |str|.
+template <typename CHAR>
+void SetupReplacement(
+    void (Replacements<CHAR>::*func)(const CHAR*, const Component&),
+    Replacements<CHAR>* replacements, const CHAR* str) {
+  if (!str)
+    return;
+  Component comp;  // |len| is only overwritten when |str| is non-empty.
+  if (str[0])
+    comp.len = static_cast<int>(std::char_traits<CHAR>::length(str));
+  (replacements->*func)(str, comp);
+}
+
+// Returns the canonicalized string for the given URL string for the
+// GURLTest.Types test.
+std::string TypesTestCase(const char* src) {
+  // Parse |src| and hand back whatever spec GURL produced, valid or not.
+  return GURL(src).possibly_invalid_spec();
+}
+
+} // namespace
+
+// Different types of URLs should be handled differently, and handed off to
+// different canonicalizers.
+TEST(GURLTest, Types) {
+ // URLs with unknown schemes should be treated as path URLs, even when they
+ // have things like "://". Note that the host-like part keeps its case.
+ EXPECT_EQ("something:///HOSTNAME.com/",
+ TypesTestCase("something:///HOSTNAME.com/"));
+
+ // Conversely, URLs with known schemes should always trigger standard URL
+ // handling: the host is lower-cased and "//" plus a "/" path are supplied.
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
+
+#ifdef WIN32
+ // URLs that look like Windows absolute path specs.
+ EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
+ EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
+#endif  // WIN32
+}
+
+// Test the basic creation and querying of components in a GURL. We assume that
+// the parser is already tested and works, so we are mostly interested if the
+// object does the right thing with the results.
+TEST(GURLTest, Components) {
+ GURL empty_url(gurl_base::UTF8ToUTF16(""));
+ EXPECT_TRUE(empty_url.is_empty());
+ EXPECT_FALSE(empty_url.is_valid());
+
+ GURL url(gurl_base::UTF8ToUTF16("http://user:pass@google.com:99/foo;bar?q=a#ref"));
+ EXPECT_FALSE(url.is_empty());
+ EXPECT_TRUE(url.is_valid());
+ EXPECT_TRUE(url.SchemeIs("http"));
+ EXPECT_FALSE(url.SchemeIsFile());
+
+ // This is the narrow version of the URL, which should match the wide input.
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec());
+
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("user", url.username());
+ EXPECT_EQ("pass", url.password());
+ EXPECT_EQ("google.com", url.host());
+ EXPECT_EQ("99", url.port());
+ EXPECT_EQ(99, url.IntPort());
+ EXPECT_EQ("/foo;bar", url.path());
+ EXPECT_EQ("q=a", url.query());
+ EXPECT_EQ("ref", url.ref());
+
+ // Test parsing userinfo with special characters.
+ GURL url_special_pass("http://user:%40!$&'()*+,;=:@google.com:12345");
+ EXPECT_TRUE(url_special_pass.is_valid());
+ // GURL canonicalizes some delimiters: ' ; = : come back %-escaped here.
+ EXPECT_EQ("%40!$&%27()*+,%3B%3D%3A", url_special_pass.password());
+ EXPECT_EQ("google.com", url_special_pass.host());
+ EXPECT_EQ("12345", url_special_pass.port());
+}
+
+TEST(GURLTest, Empty) {
+ GURL url;  // Default-constructed.
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("", url.spec());
+
+ EXPECT_EQ("", url.scheme());  // Every component getter yields "".
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("", url.host());
+ EXPECT_EQ("", url.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, url.IntPort());
+ EXPECT_EQ("", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+}
+
+TEST(GURLTest, Copy) {
+ GURL url(gurl_base::UTF8ToUTF16(
+ "http://user:pass@google.com:99/foo;bar?q=a#ref"));
+
+ GURL url2(url);  // Copy constructor under test.
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("http", url2.scheme());
+ EXPECT_EQ("user", url2.username());
+ EXPECT_EQ("pass", url2.password());
+ EXPECT_EQ("google.com", url2.host());
+ EXPECT_EQ("99", url2.port());
+ EXPECT_EQ(99, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // A copy of an invalid URL should itself be invalid, with empty components.
+ GURL invalid;
+ GURL invalid2(invalid);
+ EXPECT_FALSE(invalid2.is_valid());
+ EXPECT_EQ("", invalid2.spec());
+ EXPECT_EQ("", invalid2.scheme());
+ EXPECT_EQ("", invalid2.username());
+ EXPECT_EQ("", invalid2.password());
+ EXPECT_EQ("", invalid2.host());
+ EXPECT_EQ("", invalid2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort());
+ EXPECT_EQ("", invalid2.path());
+ EXPECT_EQ("", invalid2.query());
+ EXPECT_EQ("", invalid2.ref());
+}
+
+TEST(GURLTest, Assign) {
+ GURL url(gurl_base::UTF8ToUTF16(
+ "http://user:pass@google.com:99/foo;bar?q=a#ref"));
+
+ GURL url2;
+ url2 = url;  // Copy assignment under test.
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("http", url2.scheme());
+ EXPECT_EQ("user", url2.username());
+ EXPECT_EQ("pass", url2.password());
+ EXPECT_EQ("google.com", url2.host());
+ EXPECT_EQ("99", url2.port());
+ EXPECT_EQ(99, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // Assigning from an invalid URL should leave the target invalid and empty.
+ GURL invalid;
+ GURL invalid2;
+ invalid2 = invalid;
+ EXPECT_FALSE(invalid2.is_valid());
+ EXPECT_EQ("", invalid2.spec());
+ EXPECT_EQ("", invalid2.scheme());
+ EXPECT_EQ("", invalid2.username());
+ EXPECT_EQ("", invalid2.password());
+ EXPECT_EQ("", invalid2.host());
+ EXPECT_EQ("", invalid2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, invalid2.IntPort());
+ EXPECT_EQ("", invalid2.path());
+ EXPECT_EQ("", invalid2.query());
+ EXPECT_EQ("", invalid2.ref());
+}
+
+// This is a regression test for http://crbug.com/309975.
+TEST(GURLTest, SelfAssign) {
+ GURL a("filesystem:http://example.com/temporary/");
+ // Self-assignment should not crash.
+ a = *&a; // The *& defeats Clang's -Wself-assign warning.
+}
+
+TEST(GURLTest, CopyFileSystem) {
+ GURL url(gurl_base::UTF8ToUTF16(
+ "filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref"));
+
+ GURL url2(url);
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("filesystem:https://google.com:99/t/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("filesystem", url2.scheme());
+ EXPECT_EQ("", url2.username());
+ EXPECT_EQ("", url2.password());
+ EXPECT_EQ("", url2.host());
+ EXPECT_EQ("", url2.port());
+ EXPECT_EQ(PORT_UNSPECIFIED, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ const GURL* inner = url2.inner_url();
+ ASSERT_TRUE(inner);  // Stop here rather than dereference null below.
+ EXPECT_EQ("https", inner->scheme());
+ EXPECT_EQ("", inner->username());
+ EXPECT_EQ("", inner->password());
+ EXPECT_EQ("google.com", inner->host());
+ EXPECT_EQ("99", inner->port());
+ EXPECT_EQ(99, inner->IntPort());
+ EXPECT_EQ("/t", inner->path());
+ EXPECT_EQ("", inner->query());
+ EXPECT_EQ("", inner->ref());
+}
+
+TEST(GURLTest, IsValid) {
+  const char* const kValidSpecs[] = {
+      "http://google.com",
+      "unknown://google.com",
+      "http://user:pass@google.com",
+      "http://google.com:12345",
+      "http://google.com/path",
+      "http://google.com//path",
+      "http://google.com?k=v#fragment",
+      "http://user:pass@google.com:12345/path?k=v#fragment",
+      "http:/path",
+      "http:path",
+  };
+  for (const char* spec : kValidSpecs) {
+    EXPECT_TRUE(GURL(spec).is_valid())
+        << "Case: " << spec;
+  }
+
+  const char* const kInvalidSpecs[] = {
+      "http://?k=v",
+      "http:://google.com",
+      "http//google.com",
+      "http://google.com:12three45",
+      "://google.com",
+      "path",
+  };
+  for (const char* spec : kInvalidSpecs) {
+    EXPECT_FALSE(GURL(spec).is_valid())
+        << "Case: " << spec;
+  }
+}
+
+TEST(GURLTest, ExtraSlashesBeforeAuthority) {
+ // According to RFC 3986, the hierarchical part of a URI with an authority
+ // must use only two slashes; GURL intentionally just ignores extra slashes
+ // if there are more than 2, and parses the following part as an authority.
+ GURL url("http:///host");
+ EXPECT_EQ("host", url.host());
+ EXPECT_EQ("/", url.path());
+}
+
+// Given an invalid URL, we should still get most of the components.
+TEST(GURLTest, ComponentGettersWorkEvenForInvalidURL) {
+ GURL url("http:google.com:foo");  // "foo" is not a usable port.
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec());
+
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("google.com", url.host());
+ EXPECT_EQ("foo", url.port());
+ EXPECT_EQ(PORT_INVALID, url.IntPort());
+ EXPECT_EQ("/", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+}
+
+TEST(GURLTest, Resolve) {
+  // The tricky cases for relative URL resolving are tested in the
+  // canonicalizer unit test. Here, we just test that the GURL integration
+  // works properly, through both the 8-bit and the UTF-16 entry points.
+  struct ResolveCase {
+    const char* base;
+    const char* relative;
+    bool expected_valid;
+    const char* expected;
+  } resolve_cases[] = {
+      {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"},
+      {"http://www.google.com/foo/", "bar", true, "http://www.google.com/foo/bar"},
+      {"http://www.google.com/foo/", "/bar", true, "http://www.google.com/bar"},
+      {"http://www.google.com/foo", "bar", true, "http://www.google.com/bar"},
+      {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"},
+      {"http://www.google.com/", "http://images.\tgoogle.\ncom/\rfoo.html", true, "http://images.google.com/foo.html"},
+      {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
+      {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
+      {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
+      // A non-standard base can be replaced with a standard absolute URL.
+      {"data:blahblah", "http://google.com/", true, "http://google.com/"},
+      {"data:blahblah", "http:google.com", true, "http://google.com/"},
+      // Filesystem URLs have different paths to test.
+      {"filesystem:http://www.google.com/type/", "foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+      {"filesystem:http://www.google.com/type/", "../foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(resolve_cases); i++) {
+    // 8-bit code path.
+    GURL input(resolve_cases[i].base);
+    GURL output = input.Resolve(resolve_cases[i].relative);
+    EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
+    EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL);
+
+    // Wide code path: resolve against |inputw|, the UTF-16-constructed base.
+    GURL inputw(gurl_base::UTF8ToUTF16(resolve_cases[i].base));
+    GURL outputw =
+        inputw.Resolve(gurl_base::UTF8ToUTF16(resolve_cases[i].relative));
+    EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
+    EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
+    EXPECT_EQ(outputw.SchemeIsFileSystem(), outputw.inner_url() != NULL);
+  }
+}
+
+TEST(GURLTest, GetOrigin) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+  } cases[] = {
+      {"http://www.google.com", "http://www.google.com/"},
+      {"javascript:window.alert(\"hello,world\");", ""},
+      {"http://user:pass@www.google.com:21/blah#baz",
+       "http://www.google.com:21/"},
+      {"http://user@www.google.com", "http://www.google.com/"},
+      {"http://:pass@www.google.com", "http://www.google.com/"},
+      {"http://:@www.google.com", "http://www.google.com/"},
+      {"filesystem:http://www.google.com/temp/foo?q#b",
+       "http://www.google.com/"},
+      {"filesystem:http://user:pass@google.com:21/blah#baz",
+       "http://google.com:21/"},
+      {"blob:null/guid-goes-here", ""},
+      {"blob:http://origin/guid-goes-here", "" /* should be http://origin/ */},
+  };
+  for (const TestCase& test_case : cases) {
+    // Origins are compared through their spec() string.
+    const GURL origin = GURL(test_case.input).GetOrigin();
+    EXPECT_EQ(test_case.expected, origin.spec());
+  }
+}
+
+TEST(GURLTest, GetAsReferrer) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+  } cases[] = {
+      {"http://www.google.com", "http://www.google.com/"},
+      {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/blah"},
+      {"http://user@www.google.com", "http://www.google.com/"},
+      {"http://:pass@www.google.com", "http://www.google.com/"},
+      {"http://:@www.google.com", "http://www.google.com/"},
+      {"http://www.google.com/temp/foo?q#b", "http://www.google.com/temp/foo?q"},
+      {"not a url", ""},
+      {"unknown-scheme://foo.html", ""},
+      {"file:///tmp/test.html", ""},
+      {"https://www.google.com", "https://www.google.com/"},
+  };
+  for (const TestCase& test_case : cases) {
+    // Referrers are compared through their spec() string.
+    const GURL referrer = GURL(test_case.input).GetAsReferrer();
+    EXPECT_EQ(test_case.expected, referrer.spec());
+  }
+}
+
+TEST(GURLTest, GetWithEmptyPath) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+  } cases[] = {
+      {"http://www.google.com", "http://www.google.com/"},
+      {"javascript:window.alert(\"hello, world\");", ""},
+      {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"},
+      {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"},
+      {"filesystem:file:///temporary/bar.html?baz=22", "filesystem:file:///temporary/"},
+  };
+
+  for (const TestCase& test_case : cases) {
+    // The result is compared through its spec() string.
+    const GURL path_stripped = GURL(test_case.input).GetWithEmptyPath();
+    EXPECT_EQ(test_case.expected, path_stripped.spec());
+  }
+}
+
+TEST(GURLTest, GetWithoutFilename) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+  } cases[] = {
+      // Common Standard URLs.
+      {"https://www.google.com", "https://www.google.com/"},
+      {"https://www.google.com/", "https://www.google.com/"},
+      {"https://www.google.com/maps.htm", "https://www.google.com/"},
+      {"https://www.google.com/maps/", "https://www.google.com/maps/"},
+      {"https://www.google.com/index.html", "https://www.google.com/"},
+      {"https://www.google.com/index.html?q=maps", "https://www.google.com/"},
+      {"https://www.google.com/index.html#maps/", "https://www.google.com/"},
+      {"https://foo:bar@www.google.com/maps.htm", "https://foo:bar@www.google.com/"},
+      {"https://www.google.com/maps/au/index.html", "https://www.google.com/maps/au/"},
+      {"https://www.google.com/maps/au/north", "https://www.google.com/maps/au/"},
+      {"https://www.google.com/maps/au/north/", "https://www.google.com/maps/au/north/"},
+      {"https://www.google.com/maps/au/index.html?q=maps#fragment/", "https://www.google.com/maps/au/"},
+      {"http://www.google.com:8000/maps/au/index.html?q=maps#fragment/", "http://www.google.com:8000/maps/au/"},
+      {"https://www.google.com/maps/au/north/?q=maps#fragment", "https://www.google.com/maps/au/north/"},
+      {"https://www.google.com/maps/au/north?q=maps#fragment", "https://www.google.com/maps/au/"},
+      // Less common standard URLs.
+      {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"},
+      {"file:///temporary/bar.html?baz=22","file:///temporary/"},
+      {"ftp://foo/test/index.html", "ftp://foo/test/"},
+      {"gopher://foo/test/index.html", "gopher://foo/test/"},
+      {"ws://foo/test/index.html", "ws://foo/test/"},
+      // Non-standard, hierarchical URLs.
+      {"chrome://foo/bar.html", "chrome://foo/"},
+      {"httpa://foo/test/index.html", "httpa://foo/test/"},
+      // Non-standard, non-hierarchical URLs.
+      {"blob:https://foo.bar/test/index.html", ""},
+      {"about:blank", ""},
+      {"data:foobar", ""},
+      {"scheme:opaque_data", ""},
+      // Invalid URLs.
+      {"foobar", ""},
+  };
+
+  for (size_t index = 0; index < gurl_base::size(cases); ++index) {
+    const TestCase& test_case = cases[index];
+    const GURL stripped = GURL(test_case.input).GetWithoutFilename();
+    EXPECT_EQ(test_case.expected, stripped.spec()) << index;
+  }
+}
+
+TEST(GURLTest, Replacements) {
+  // The URL canonicalizer replacement test will handle most of these cases.
+  // The most important thing to do here is to check that the proper
+  // canonicalizer gets called based on the scheme of the input.
+  struct ReplaceCase {
+    const char* base;
+    const char* scheme;
+    const char* username;
+    const char* password;
+    const char* host;
+    const char* port;
+    const char* path;
+    const char* query;
+    const char* ref;
+    const char* expected;
+  } replace_cases[] = {
+      {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL,
+       NULL, "/", "", "", "http://www.google.com/"},
+      {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "",
+       "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
+      {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99",
+       "/foo", "search", "ref", "http://www.google.com:99/foo?search#ref"},
+#ifdef WIN32
+      {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "",
+       "c:\\", "", "", "file:///C:/"},
+#endif
+      {"filesystem:http://www.google.com/foo/bar.html?foo#bar", NULL, NULL,
+       NULL, NULL, NULL, "/", "", "", "filesystem:http://www.google.com/foo/"},
+      // Lengthen the URL instead of shortening it, to test creation of
+      // inner_url.
+      {"filesystem:http://www.google.com/foo/", NULL, NULL, NULL, NULL, NULL,
+       "bar.html", "foo", "bar",
+       "filesystem:http://www.google.com/foo/bar.html?foo#bar"},
+  };
+
+  for (const ReplaceCase& cur : replace_cases) {
+    // Fill a Replacements object from the table row, then apply it.
+    const GURL url(cur.base);
+    GURL::Replacements repl;
+    SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme);
+    SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username);
+    SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password);
+    SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host);
+    SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port);
+    SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path);
+    SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query);
+    SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref);
+    const GURL output = url.ReplaceComponents(repl);
+
+    EXPECT_EQ(cur.expected, output.spec());
+
+    EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != nullptr);
+    if (output.SchemeIsFileSystem()) {
+      // TODO(mmenke): inner_url()->spec() is currently the same as the spec()
+      // for the GURL itself. This should be fixed.
+      // See https://crbug.com/619596
+      EXPECT_EQ(cur.expected, output.inner_url()->spec());
+    }
+  }
+}
+
+TEST(GURLTest, ClearFragmentOnDataUrl) {
+  // http://crbug.com/291747 - once the ref is cleared, the spec of a data URL
+  // may legitimately end in whitespace. Test that this does not trigger the
+  // Parsed importing validation GURL_DCHECK in GURL.
+  const GURL with_ref(" data: one ? two # three ");
+
+  // By default the trailing whitespace will have been stripped.
+  EXPECT_EQ("data: one ? two # three", with_ref.spec());
+  GURL::Replacements clear_ref;
+  clear_ref.ClearRef();
+  const GURL without_ref = with_ref.ReplaceComponents(clear_ref);
+
+  EXPECT_EQ("data: one ? two ", without_ref.spec());
+
+  // Importing a parsed URL via this constructor overload will retain trailing
+  // whitespace.
+  const GURL imported(without_ref.spec(),
+                      without_ref.parsed_for_possibly_invalid_spec(),
+                      without_ref.is_valid());
+  EXPECT_EQ(without_ref, imported);
+  EXPECT_EQ(imported.query(), " two ");
+}
+
+TEST(GURLTest, PathForRequest) {
+  struct TestCase {
+    const char* input;
+    const char* expected;
+    const char* inner_expected;
+  } cases[] = {
+      {"http://www.google.com", "/", NULL},
+      {"http://www.google.com/", "/", NULL},
+      {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22", NULL},
+      {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html", NULL},
+      {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query", NULL},
+      {"filesystem:http://www.google.com/temporary/foo/bar.html?query#ref", "/foo/bar.html?query", "/temporary"},
+      {"filesystem:http://www.google.com/temporary/foo/bar.html?query", "/foo/bar.html?query", "/temporary"},
+  };
+
+  for (const TestCase& test_case : cases) {
+    const GURL url(test_case.input);
+    EXPECT_EQ(test_case.expected, url.PathForRequest());
+    const GURL* inner = url.inner_url();
+    EXPECT_EQ(test_case.inner_expected == nullptr, inner == nullptr);
+    if (inner && test_case.inner_expected)
+      EXPECT_EQ(test_case.inner_expected, inner->PathForRequest());
+  }
+}
+
+TEST(GURLTest, EffectiveIntPort) {
+  struct PortTest {
+    const char* spec;
+    int expected_int_port;
+  } port_tests[] = {
+      // http
+      {"http://www.google.com/", 80},
+      {"http://www.google.com:80/", 80},
+      {"http://www.google.com:443/", 443},
+
+      // https
+      {"https://www.google.com/", 443},
+      {"https://www.google.com:443/", 443},
+      {"https://www.google.com:80/", 80},
+
+      // ftp
+      {"ftp://www.google.com/", 21},
+      {"ftp://www.google.com:21/", 21},
+      {"ftp://www.google.com:80/", 80},
+
+      // gopher
+      {"gopher://www.google.com/", 70},
+      {"gopher://www.google.com:70/", 70},
+      {"gopher://www.google.com:80/", 80},
+
+      // file - no port
+      {"file://www.google.com/", PORT_UNSPECIFIED},
+      {"file://www.google.com:443/", PORT_UNSPECIFIED},
+
+      // data - no port
+      {"data:www.google.com:90", PORT_UNSPECIFIED},
+      {"data:www.google.com", PORT_UNSPECIFIED},
+
+      // filesystem - no port
+      {"filesystem:http://www.google.com:90/t/foo", PORT_UNSPECIFIED},
+      {"filesystem:file:///t/foo", PORT_UNSPECIFIED},
+  };
+
+  for (const PortTest& port_test : port_tests) {
+    const int effective_port = GURL(port_test.spec).EffectiveIntPort();
+    EXPECT_EQ(port_test.expected_int_port, effective_port);
+  }
+}
+
+TEST(GURLTest, IPAddress) {
+  struct IPTest {
+    const char* spec;
+    bool expected_ip;
+  } ip_tests[] = {
+      {"http://www.google.com/", false},
+      {"http://192.168.9.1/", true},
+      {"http://192.168.9.1.2/", false},
+      {"http://192.168.m.1/", false},
+      {"http://2001:db8::1/", false},
+      {"http://[2001:db8::1]/", true},
+      {"", false},
+      {"some random input!", false},
+  };
+
+  for (const IPTest& ip_test : ip_tests) {
+    const bool host_is_ip = GURL(ip_test.spec).HostIsIPAddress();
+    EXPECT_EQ(ip_test.expected_ip, host_is_ip);
+  }
+}
+
+TEST(GURLTest, HostNoBrackets) {
+  struct TestCase {
+    const char* input;
+    const char* expected_host;
+    const char* expected_plainhost;
+  } cases[] = {
+      {"http://www.google.com", "www.google.com", "www.google.com"},
+      {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"},
+      {"http://[::]/", "[::]", "::"},
+
+      // Don't require a valid URL, but don't crash either.
+      {"http://[]/", "[]", ""},
+      {"http://[x]/", "[x]", "x"},
+      {"http://[x/", "[x", "[x"},
+      {"http://x]/", "x]", "x]"},
+      {"http://[/", "[", "["},
+      {"http://]/", "]", "]"},
+      {"", "", ""},
+  };
+  for (const TestCase& test_case : cases) {
+    const GURL url(test_case.input);
+    EXPECT_EQ(test_case.expected_host, url.host());
+    EXPECT_EQ(test_case.expected_plainhost, url.HostNoBrackets());
+    EXPECT_EQ(test_case.expected_plainhost, url.HostNoBracketsPiece());
+  }
+}
+
+TEST(GURLTest, DomainIs) {
+  const GURL bare_domain_url("http://google.com/foo");
+  EXPECT_TRUE(bare_domain_url.DomainIs("google.com"));
+
+  // Subdomain and port are ignored.
+  const GURL subdomain_and_port_url("http://www.google.com:99/foo");
+  EXPECT_TRUE(subdomain_and_port_url.DomainIs("google.com"));
+
+  // Different top-level domain.
+  const GURL other_tld_url("http://www.google.com.cn/foo");
+  EXPECT_FALSE(other_tld_url.DomainIs("google.com"));
+
+  // Different host name.
+  const GURL other_host_url("http://www.iamnotgoogle.com/foo");
+  EXPECT_FALSE(other_host_url.DomainIs("google.com"));
+
+  // The input must be lower-cased otherwise DomainIs returns false.
+  const GURL lowercase_host_url("http://www.google.com/foo");
+  EXPECT_FALSE(lowercase_host_url.DomainIs("Google.com"));
+
+  // If the URL is invalid, DomainIs returns false.
+  const GURL invalid_url("google.com");
+  EXPECT_FALSE(invalid_url.is_valid());
+  EXPECT_FALSE(invalid_url.DomainIs("google.com"));
+
+  const GURL url_with_escape_chars("https://www.,.test");
+  EXPECT_TRUE(url_with_escape_chars.is_valid());
+  EXPECT_EQ(url_with_escape_chars.host(), "www.%2C.test");
+  EXPECT_TRUE(url_with_escape_chars.DomainIs("%2C.test"));
+}
+
+TEST(GURLTest, DomainIsTerminatingDotBehavior) {
+  // A host that ends with a dot matches input domains both with and
+  // without a trailing dot.
+  const GURL dotted_host_url("http://www.google.com./foo");
+  EXPECT_TRUE(dotted_host_url.DomainIs("google.com"));
+  EXPECT_TRUE(dotted_host_url.DomainIs("google.com."));
+  EXPECT_TRUE(dotted_host_url.DomainIs(".com"));
+  EXPECT_TRUE(dotted_host_url.DomainIs(".com."));
+
+  // The reverse does not hold: a host without a trailing dot is not
+  // matched by an input domain that has one.
+  const GURL plain_host_url("http://google.com/foo");
+  EXPECT_FALSE(plain_host_url.DomainIs("google.com."));
+
+  // If the URL ends with two dots, it doesn't match.
+  const GURL double_dotted_host_url("http://www.google.com../foo");
+  EXPECT_FALSE(double_dotted_host_url.DomainIs("google.com"));
+}
+
+TEST(GURLTest, DomainIsWithFilesystemScheme) {
+  const GURL matching_inner_host("filesystem:http://www.google.com:99/foo/");
+  EXPECT_TRUE(matching_inner_host.DomainIs("google.com"));
+
+  const GURL other_inner_host("filesystem:http://www.iamnotgoogle.com/foo/");
+  EXPECT_FALSE(other_inner_host.DomainIs("google.com"));
+}
+
+// Inputs should have their newlines stripped.
+TEST(GURLTest, Newlines) {
+  // Constructor.
+  const GURL constructed(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
+  EXPECT_EQ("http://www.google.com/asdf", constructed.spec());
+  EXPECT_FALSE(
+      constructed.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Relative path resolver.
+  const GURL resolved = constructed.Resolve(" \n /fo\to\r ");
+  EXPECT_EQ("http://www.google.com/foo", resolved.spec());
+  EXPECT_FALSE(
+      resolved.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Constructor, this time with a '<' in the input.
+  const GURL constructed_lt(" \t ht\ntp://\twww.goo\rgle.com/as\ndf< \n ");
+  EXPECT_EQ("http://www.google.com/asdf%3C", constructed_lt.spec());
+  EXPECT_TRUE(
+      constructed_lt.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Relative path resolver, this time with a '<' in the input.
+  const GURL resolved_lt = constructed.Resolve(" \n /fo\to<\r ");
+  EXPECT_EQ("http://www.google.com/foo%3C", resolved_lt.spec());
+  EXPECT_TRUE(
+      resolved_lt.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Note that newlines are NOT stripped from ReplaceComponents.
+}
+
+TEST(GURLTest, IsStandard) {
+  const GURL http_url("http:foo/bar");
+  EXPECT_TRUE(http_url.IsStandard());
+
+  const GURL unknown_scheme_url("foo:bar/baz");
+  EXPECT_FALSE(unknown_scheme_url.IsStandard());
+
+  const GURL unknown_scheme_with_slashes_url("foo://bar/baz");
+  EXPECT_FALSE(unknown_scheme_with_slashes_url.IsStandard());
+
+  const GURL cid_url("cid:bar@baz");
+  EXPECT_FALSE(cid_url.IsStandard());
+}
+
+TEST(GURLTest, SchemeIsHTTPOrHTTPS) {
+ EXPECT_TRUE(GURL("http://bar/").SchemeIsHTTPOrHTTPS());
+ EXPECT_TRUE(GURL("HTTPS://BAR").SchemeIsHTTPOrHTTPS());  // Upper-case input is expected to match too.
+ EXPECT_FALSE(GURL("ftp://bar/").SchemeIsHTTPOrHTTPS());
+}
+
+TEST(GURLTest, SchemeIsWSOrWSS) {
+ EXPECT_TRUE(GURL("WS://BAR/").SchemeIsWSOrWSS());  // Upper-case input is expected to match too.
+ EXPECT_TRUE(GURL("wss://bar/").SchemeIsWSOrWSS());
+ EXPECT_FALSE(GURL("http://bar/").SchemeIsWSOrWSS());
+}
+
+TEST(GURLTest, SchemeIsCryptographic) {
+ EXPECT_TRUE(GURL("https://foo.bar.com/").SchemeIsCryptographic());
+ EXPECT_TRUE(GURL("HTTPS://foo.bar.com/").SchemeIsCryptographic());
+ EXPECT_TRUE(GURL("HtTpS://foo.bar.com/").SchemeIsCryptographic());
+ // wss is expected to count as cryptographic as well, in any letter case.
+ EXPECT_TRUE(GURL("wss://foo.bar.com/").SchemeIsCryptographic());
+ EXPECT_TRUE(GURL("WSS://foo.bar.com/").SchemeIsCryptographic());
+ EXPECT_TRUE(GURL("WsS://foo.bar.com/").SchemeIsCryptographic());
+ // The plain-text counterparts (http, ws) must not be reported as such.
+ EXPECT_FALSE(GURL("http://foo.bar.com/").SchemeIsCryptographic());
+ EXPECT_FALSE(GURL("ws://foo.bar.com/").SchemeIsCryptographic());
+}
+
+TEST(GURLTest, SchemeIsCryptographicStatic) {  // Exercises the static overload, which takes a scheme string.
+ EXPECT_TRUE(GURL::SchemeIsCryptographic("https"));
+ EXPECT_TRUE(GURL::SchemeIsCryptographic("wss"));
+ EXPECT_FALSE(GURL::SchemeIsCryptographic("http"));
+ EXPECT_FALSE(GURL::SchemeIsCryptographic("ws"));
+ EXPECT_FALSE(GURL::SchemeIsCryptographic("ftp"));
+}
+
+TEST(GURLTest, SchemeIsBlob) {
+ EXPECT_TRUE(GURL("BLOB://BAR/").SchemeIsBlob());  // Upper-case input is expected to match too.
+ EXPECT_TRUE(GURL("blob://bar/").SchemeIsBlob());
+ EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob());
+}
+
+// Checks that GURL::GetContent extracts the 'content' of a URL correctly.
+// This gets complicated with multiple schemes (view-source:http:) and with
+// javascript URLs. See GURL::GetContent for more details.
+TEST(GURLTest, ContentForNonStandardURLs) {
+  struct TestCase {
+    const char* url;
+    const char* expected;
+  } cases[] = {
+      {"null", ""},
+      {"not-a-standard-scheme:this is arbitrary content",
+       "this is arbitrary content"},
+
+      // When there are multiple schemes, only the first is excluded from the
+      // content. Note also that for e.g. 'http://', the '//' is part of the
+      // content not the scheme.
+      {"view-source:http://example.com/path", "http://example.com/path"},
+      {"blob:http://example.com/GUID", "http://example.com/GUID"},
+      {"blob://http://example.com/GUID", "//http://example.com/GUID"},
+      {"blob:http://user:password@example.com/GUID",
+       "http://user:password@example.com/GUID"},
+
+      // The octothorpe character ('#') marks the end of the URL content, and
+      // the start of the fragment. It should not be included in the content.
+      {"http://www.example.com/GUID#ref", "www.example.com/GUID"},
+      {"http://me:secret@example.com/GUID/#ref", "me:secret@example.com/GUID/"},
+      {"data:text/html,Question?<div style=\"color: #bad\">idea</div>",
+       "text/html,Question?<div style=\"color: "},
+
+      // TODO(mkwst): This seems like a bug. https://crbug.com/513600
+      {"filesystem:http://example.com/path", "/"},
+
+      // Javascript URLs include '#' symbols in their content.
+      {"javascript:#", "#"},
+      {"javascript:alert('#');", "alert('#');"},
+  };
+
+  for (const TestCase& test_case : cases) {
+    const std::string content = GURL(test_case.url).GetContent();
+    EXPECT_EQ(test_case.expected, content) << test_case.url;
+  }
+}
+
+// Checks that path() extracts the right piece for unusual URLs. This gets
+// complicated with multiple schemes (view-source:http:) and whenever
+// octothorpes ('#') are involved.
+TEST(GURLTest, PathForNonStandardURLs) {
+  struct TestCase {
+    const char* url;
+    const char* expected;
+  } cases[] = {
+      {"null", ""},
+      {"not-a-standard-scheme:this is arbitrary content",
+       "this is arbitrary content"},
+      {"view-source:http://example.com/path", "http://example.com/path"},
+      {"blob:http://example.com/GUID", "http://example.com/GUID"},
+      {"blob://http://example.com/GUID", "//http://example.com/GUID"},
+      {"blob:http://user:password@example.com/GUID",
+       "http://user:password@example.com/GUID"},
+
+      {"http://www.example.com/GUID#ref", "/GUID"},
+      {"http://me:secret@example.com/GUID/#ref", "/GUID/"},
+      {"data:text/html,Question?<div style=\"color: #bad\">idea</div>",
+       "text/html,Question"},
+
+      // TODO(mkwst): This seems like a bug. https://crbug.com/513600
+      {"filesystem:http://example.com/path", "/"},
+  };
+
+  for (const TestCase& test_case : cases) {
+    const GURL parsed(test_case.url);
+    EXPECT_EQ(test_case.expected, parsed.path()) << test_case.url;
+  }
+}
+
+TEST(GURLTest, IsAboutBlank) {
+  const std::string kAboutBlankUrls[] = {"about:blank", "about:blank?foo",
+                                         "about:blank/#foo",
+                                         "about:blank?foo#foo"};
+  for (const std::string& spec : kAboutBlankUrls)
+    EXPECT_TRUE(GURL(spec).IsAboutBlank()) << spec;
+
+  const std::string kNotAboutBlankUrls[] = {
+      "http:blank",      "about:blan",          "about://blank",
+      "about:blank/foo", "about://:8000/blank", "about://foo:foo@/blank",
+      "foo@about:blank", "foo:bar@about:blank", "about:blank:8000",
+      "about:blANk"};
+  for (const std::string& spec : kNotAboutBlankUrls)
+    EXPECT_FALSE(GURL(spec).IsAboutBlank()) << spec;
+}
+
+TEST(GURLTest, IsAboutSrcdoc) {
+  const std::string kAboutSrcdocUrls[] = {
+      "about:srcdoc", "about:srcdoc/", "about:srcdoc?foo", "about:srcdoc/#foo",
+      "about:srcdoc?foo#foo"};
+  for (const std::string& spec : kAboutSrcdocUrls)
+    EXPECT_TRUE(GURL(spec).IsAboutSrcdoc()) << spec;
+
+  const std::string kNotAboutSrcdocUrls[] = {"http:srcdoc",
+                                             "about:srcdo",
+                                             "about://srcdoc",
+                                             "about://srcdoc\\",
+                                             "about:srcdoc/foo",
+                                             "about://:8000/srcdoc",
+                                             "about://foo:foo@/srcdoc",
+                                             "foo@about:srcdoc",
+                                             "foo:bar@about:srcdoc",
+                                             "about:srcdoc:8000",
+                                             "about:srCDOc"};
+  for (const std::string& spec : kNotAboutSrcdocUrls)
+    EXPECT_FALSE(GURL(spec).IsAboutSrcdoc()) << spec;
+}
+
+TEST(GURLTest, EqualsIgnoringRef) {
+  const struct {
+    const char* url_a;
+    const char* url_b;
+    bool are_equals;
+  } kTestCases[] = {
+      // No ref.
+      {"http://a.com", "http://a.com", true},
+      {"http://a.com", "http://b.com", false},
+
+      // Same Ref.
+      {"http://a.com#foo", "http://a.com#foo", true},
+      {"http://a.com#foo", "http://b.com#foo", false},
+
+      // Different Refs.
+      {"http://a.com#foo", "http://a.com#bar", true},
+      {"http://a.com#foo", "http://b.com#bar", false},
+
+      // One has a ref, the other doesn't.
+      {"http://a.com#foo", "http://a.com", true},
+      {"http://a.com#foo", "http://b.com", false},
+
+      // Empty refs.
+      {"http://a.com#", "http://a.com#", true},
+      {"http://a.com#", "http://a.com", true},
+
+      // URLs that differ only by their last character.
+      {"http://aaa", "http://aab", false},
+      {"http://aaa#foo", "http://aab#foo", false},
+
+      // Different size of the part before the ref.
+      {"http://123#a", "http://123456#a", false},
+
+      // Blob URLs
+      {"blob:http://a.com#foo", "blob:http://a.com#foo", true},
+      {"blob:http://a.com#foo", "blob:http://a.com#bar", true},
+      {"blob:http://a.com#foo", "blob:http://b.com#bar", false},
+
+      // Filesystem URLs
+      {"filesystem:http://a.com#foo", "filesystem:http://a.com#foo", true},
+      {"filesystem:http://a.com#foo", "filesystem:http://a.com#bar", true},
+      {"filesystem:http://a.com#foo", "filesystem:http://b.com#bar", false},
+
+      // Data URLs
+      {"data:text/html,a#foo", "data:text/html,a#bar", true},
+      {"data:text/html,a#foo", "data:text/html,a#foo", true},
+      {"data:text/html,a#foo", "data:text/html,b#foo", false},
+  };
+
+  for (const auto& entry : kTestCases) {
+    SCOPED_TRACE(testing::Message()
+                 << std::endl
+                 << "url_a = " << entry.url_a << std::endl
+                 << "url_b = " << entry.url_b << std::endl);
+    const GURL first(entry.url_a);
+    const GURL second(entry.url_b);
+    // The comparison is expected to be symmetric, so it is checked in both
+    // directions: A versus B, then B versus A.
+    EXPECT_EQ(entry.are_equals, first.EqualsIgnoringRef(second));
+    EXPECT_EQ(entry.are_equals, second.EqualsIgnoringRef(first));
+  }
+}
+
+TEST(GURLTest, DebugAlias) {
+ GURL url("https://foo.com/bar");
+ DEBUG_ALIAS_FOR_GURL(url_debug_alias, url);  // Expected to declare |url_debug_alias|, usable as a C string below.
+ EXPECT_STREQ("https://foo.com/bar", url_debug_alias);
+}
+
+} // namespace url
diff --git a/url/origin.cc b/url/origin.cc
new file mode 100644
index 0000000..6eda15e
--- /dev/null
+++ b/url/origin.cc
@@ -0,0 +1,354 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/origin.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+
+#include "polyfills/base/logging.h"
+#include "base/stl_util.h"
+#include "base/strings/strcat.h"
+#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_util.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
+
+namespace url {
+
+Origin::Origin() : nonce_(Nonce()) {}  // |nonce_| starts out holding a default-constructed Nonce.
+
+Origin Origin::Create(const GURL& url) {
+  if (!url.is_valid())
+    return Origin();
+
+  // Work out which URL supplies the scheme/host/port for this origin.
+  SchemeHostPort origin_tuple;
+  if (url.SchemeIsFileSystem()) {
+    origin_tuple = SchemeHostPort(*url.inner_url());
+  } else if (url.SchemeIsBlob()) {
+    // For a 'blob:' URL, https://url.spec.whatwg.org/#origin defines the
+    // origin as that of the URL obtained by parsing the "path", i.e.
+    // everything after the scheme. GURL's 'GetContent()' gives us exactly
+    // that.
+    origin_tuple = SchemeHostPort(GURL(url.GetContent()));
+  } else {
+    origin_tuple = SchemeHostPort(url);
+
+    // Filtering out unrecognized schemes is SchemeHostPort's job; this is
+    // only a sanity check that the filtering actually happened.
+    GURL_DCHECK(origin_tuple.IsInvalid() || url.IsStandard() ||
+                gurl_base::Contains(GetLocalSchemes(), url.scheme_piece()) ||
+                AllowNonStandardSchemesForAndroidWebView());
+  }
+
+  if (origin_tuple.IsInvalid())
+    return Origin();
+  return Origin(std::move(origin_tuple));
+}
+
+Origin Origin::Resolve(const GURL& url, const Origin& base_origin) {
+  if (url.SchemeIs(kAboutScheme))
+    return base_origin;
+  // A URL that yields a non-opaque origin uses it; otherwise a new opaque
+  // origin is derived from |base_origin|.
+  Origin created = Origin::Create(url);
+  return created.opaque() ? base_origin.DeriveNewOpaqueOrigin() : created;
+}
+
+Origin::Origin(const Origin& other) = default;  // Copy constructor.
+Origin& Origin::operator=(const Origin& other) = default;  // Copy assignment.
+Origin::Origin(Origin&& other) = default;  // Move constructor.
+Origin& Origin::operator=(Origin&& other) = default;  // Move assignment.
+Origin::~Origin() = default;  // Destructor.
+
+// static. Yields nullopt when the resulting SchemeHostPort is invalid.
+gurl_base::Optional<Origin> Origin::UnsafelyCreateTupleOriginWithoutNormalization(
+    gurl_base::StringPiece scheme,
+    gurl_base::StringPiece host,
+    uint16_t port) {
+  SchemeHostPort candidate(scheme.as_string(), host.as_string(), port,
+                           SchemeHostPort::CHECK_CANONICALIZATION);
+  if (candidate.IsInvalid())
+    return gurl_base::nullopt;
+  return Origin(std::move(candidate));
+}
+
+// static
+gurl_base::Optional<Origin> Origin::UnsafelyCreateOpaqueOriginWithoutNormalization(
+    gurl_base::StringPiece precursor_scheme,
+    gurl_base::StringPiece precursor_host,
+    uint16_t precursor_port,
+    const Origin::Nonce& nonce) {
+  SchemeHostPort precursor(precursor_scheme.as_string(),
+                           precursor_host.as_string(), precursor_port,
+                           SchemeHostPort::CHECK_CANONICALIZATION);
+  // An invalid precursor is acceptable for an opaque origin, but only when
+  // the arguments are the canonical representation of an invalid
+  // SchemeHostPort (empty scheme, empty host, port 0); else return nullopt.
+  if (precursor.IsInvalid() &&
+      !(precursor_scheme.empty() && precursor_host.empty() &&
+        precursor_port == 0)) {
+    return gurl_base::nullopt;
+  }
+  return Origin(nonce, std::move(precursor));  // |nonce| is const: copied.
+}
+
+// static
+Origin Origin::CreateFromNormalizedTuple(std::string scheme,
+                                         std::string host,
+                                         uint16_t port) {
+  // The pieces are handed over as ALREADY_CANONICALIZED. If the tuple is
+  // invalid anyway, a default-constructed Origin is returned instead.
+  SchemeHostPort candidate(std::move(scheme), std::move(host), port,
+                           SchemeHostPort::ALREADY_CANONICALIZED);
+  return candidate.IsInvalid() ? Origin() : Origin(std::move(candidate));
+}
+
+// static
+Origin Origin::CreateOpaqueFromNormalizedPrecursorTuple(
+    std::string precursor_scheme,
+    std::string precursor_host,
+    uint16_t precursor_port,
+    const Origin::Nonce& nonce) {
+  SchemeHostPort precursor(std::move(precursor_scheme),
+                           std::move(precursor_host), precursor_port,
+                           SchemeHostPort::ALREADY_CANONICALIZED);
+  // An invalid precursor is fine for an opaque origin. |nonce| is const: copied.
+  return Origin(nonce, std::move(precursor));
+}
+
+std::string Origin::Serialize() const {
+  // Opaque origins serialize to the literal "null".
+  if (opaque())
+    return "null";
+
+  // file: origins serialize to a fixed string rather than via |tuple_|.
+  const bool is_file_origin = scheme() == kFileScheme;
+  return is_file_origin ? std::string("file://") : tuple_.Serialize();
+}
+
+GURL Origin::GetURL() const {
+  // Opaque origins map to a default-constructed GURL.
+  if (opaque())
+    return GURL();
+
+  // file: origins map to a fixed URL rather than going through |tuple_|.
+  const bool is_file_origin = scheme() == kFileScheme;
+  return is_file_origin ? GURL("file:///") : tuple_.GetURL();
+}
+
+gurl_base::Optional<gurl_base::UnguessableToken> Origin::GetNonceForSerialization()
+    const {
+  // TODO(nasko): Consider returning a reference to the nonce, not a copy.
+  return !nonce_ ? gurl_base::nullopt
+                 : gurl_base::make_optional(nonce_->token());
+}
+
+bool Origin::IsSameOriginWith(const Origin& other) const {
+  // Both the scheme/host/port tuple and the nonce have to agree. For opaque
+  // origins |tuple_| holds the precursor origin, so that is compared too.
+  return tuple_ == other.tuple_ && nonce_ == other.nonce_;
+}
+
+bool Origin::CanBeDerivedFrom(const GURL& url) const {
+ GURL_DCHECK(url.is_valid());
+
+ // For "no access" schemes, blink's SecurityOrigin will always create an
+ // opaque unique one. However, the about: scheme is also registered as such
+ // but does not behave this way, therefore exclude it from this check.
+ if (gurl_base::Contains(url::GetNoAccessSchemes(), url.scheme()) &&
+ !url.SchemeIs(kAboutScheme)) {
+ // If |this| is not opaque, definitely return false as the expectation
+ // is for an opaque origin.
+ if (!opaque())
+ return false;
+
+ // And if it is a unique opaque origin, it definitely is fine. But if there
+ // is a precursor stored, we should fall through to compare the tuples.
+ if (tuple_.IsInvalid())
+ return true;
+ }
+
+ SchemeHostPort url_tuple;
+
+ // Optimization for the common, success case: Scheme/Host/Port match on the
+ // precursor, and the URL is standard. Opaqueness does not matter as a tuple
+ // origin can always create an opaque tuple origin.
+ if (url.IsStandard()) {
+ // Note: if extra copies of the scheme and host are undesirable, this check
+ // can be implemented using StringPiece comparisons, but it then has to
+ // explicitly account for checks on port numbers.
+ if (url.SchemeIsFileSystem()) {
+ url_tuple = SchemeHostPort(*url.inner_url());
+ } else {
+ url_tuple = SchemeHostPort(url);
+ }
+ return url_tuple == tuple_;
+
+ // Blob URLs still contain an inner origin, however it is not accessible
+ // through inner_url(), therefore a specific case is required to handle it.
+ } else if (url.SchemeIsBlob()) {
+ // If |this| doesn't contain any precursor information, it is a unique
+ // opaque origin. This is a valid case, as any browser-initiated navigation
+ // to about:blank or a data: URL will result in a document with such an
+ // origin and it is valid for it to create blob: URLs.
+ if (tuple_.IsInvalid())
+ return true;
+
+ url_tuple = SchemeHostPort(GURL(url.GetContent()));
+ return url_tuple == tuple_;
+ }
+
+ // At this point, the URL has a non-standard scheme.
+ GURL_DCHECK(!url.IsStandard());
+
+ // All about: URLs (about:blank, about:srcdoc) inherit their origin from
+ // the context which navigated them, which means that they can be in any
+ // type of origin.
+ if (url.SchemeIs(kAboutScheme))
+ return true;
+
+ // All data: URLs commit in opaque origins, therefore |this| must be opaque
+ // if |url| has the data: scheme.
+ if (url.SchemeIs(kDataScheme))
+ return opaque();
+
+ // If |this| does not have a valid precursor tuple, it is a unique opaque
+ // origin, which is what we expect non-standard schemes to get.
+ if (tuple_.IsInvalid())
+ return true;
+
+ // However, when there is a precursor present, the schemes must match.
+ return url.scheme() == tuple_.scheme();
+}
+
+bool Origin::DomainIs(gurl_base::StringPiece canonical_domain) const {
+  return opaque() ? false : url::DomainIs(tuple_.host(), canonical_domain);
+}
+
+bool Origin::operator<(const Origin& other) const {  // Lexicographic: tuple, then nonce.
+  return tuple_ < other.tuple_ || (!(other.tuple_ < tuple_) && nonce_ < other.nonce_);
+}
+
+Origin Origin::DeriveNewOpaqueOrigin() const {
+  return Origin{Nonce{}, tuple_};  // Fresh nonce; the (precursor) tuple carries over.
+}
+
+std::string Origin::GetDebugString() const {
+  std::string debug = Serialize();
+  if (!opaque()) {
+    // Tuple origins log as their serialization; file origins additionally
+    // get the serialized internal tuple appended.
+    if (scheme() == kFileScheme)
+      gurl_base::StrAppend(&debug, {" [internally: ", tuple_.Serialize(), "]"});
+    return debug;
+  }
+  // Opaque origins: expose the nonce and the precursor so that mismatches
+  // between two opaque origins can be told apart. raw_token() is used so that
+  // producing the string never forces the lazy token to be generated.
+  const gurl_base::UnguessableToken& token = nonce_->raw_token();
+  const std::string nonce_text =
+      token.is_empty() ? std::string("nonce TBD") : token.ToString();
+  gurl_base::StrAppend(&debug, {" [internally: (", nonce_text, ")"});
+  if (tuple_.IsInvalid()) {
+    gurl_base::StrAppend(&debug, {" anonymous]"});
+  } else {
+    gurl_base::StrAppend(&debug, {" derived from ", tuple_.Serialize(), "]"});
+  }
+  return debug;
+}
+
+Origin::Origin(SchemeHostPort tuple) : tuple_(std::move(tuple)) {
+  GURL_DCHECK(!tuple_.IsInvalid());  // A tuple origin needs a valid tuple.
+  GURL_DCHECK(!opaque());            // |nonce_| is left unset, so not opaque.
+}
+
+// Constructs an opaque origin derived from |precursor|, copying |nonce|.
+Origin::Origin(const Nonce& nonce, SchemeHostPort precursor)
+    : tuple_(std::move(precursor)), nonce_(nonce) {  // const ref: copied, not moved.
+  GURL_DCHECK(opaque());
+  // |precursor| is retained, but not accessible via scheme()/host()/port().
+  GURL_DCHECK_EQ("", scheme());
+  GURL_DCHECK_EQ("", host());
+  GURL_DCHECK_EQ(0U, port());
+}
+
+std::ostream& operator<<(std::ostream& out, const url::Origin& origin) {
+  // Streams the debug form, which includes nonce and precursor details.
+  return out << origin.GetDebugString();
+}
+
+std::ostream& operator<<(std::ostream& out, const url::Origin::Nonce& nonce) {
+  // Streaming must stay free of side effects: read the token through
+  // raw_token() so that a not-yet-generated value is left ungenerated.
+  const gurl_base::UnguessableToken& token = nonce.raw_token();
+  if (token.is_empty()) return out << "(nonce TBD)";
+  return out << token;
+}
+
+bool IsSameOriginWith(const GURL& a, const GURL& b) {
+  return Origin::Create(a) == Origin::Create(b);  // operator== is IsSameOriginWith().
+}
+
+Origin::Nonce::Nonce() = default;  // |token_| is default-constructed; token() fills it in later.
+Origin::Nonce::Nonce(const gurl_base::UnguessableToken& token) : token_(token) {
+  GURL_CHECK(!token.is_empty());  // An empty token would read as "not generated yet".
+}
+
+const gurl_base::UnguessableToken& Origin::Nonce::token() const {
+  // An empty |token_| means the value has not been generated yet; the first
+  // read through this accessor creates it.
+  // TODO(dcheng): UnguessableToken::is_empty should go away -- what sentinel
+  // value to use instead?
+  if (token_.is_empty()) token_ = gurl_base::UnguessableToken::Create();
+  return token_;
+}
+
+const gurl_base::UnguessableToken& Origin::Nonce::raw_token() const {
+ return token_; // Returned as-is (possibly still empty); never generates it.
+}
+
+// Copying generates |other|'s token if needed, so both share one identity.
+Origin::Nonce::Nonce(const Origin::Nonce& other) : token_(other.token()) {}
+
+Origin::Nonce& Origin::Nonce::operator=(const Origin::Nonce& other) {
+ // Copying triggers lazy-generation of |other|'s token; both then hold it.
+ token_ = other.token();
+ return *this;
+}
+
+// Moving does NOT trigger lazy-generation; |other|'s raw value is taken as-is.
+Origin::Nonce::Nonce(Origin::Nonce&& other) : token_(other.token_) {
+ other.token_ = gurl_base::UnguessableToken(); // Reset |other|.
+}
+
+Origin::Nonce& Origin::Nonce::operator=(Origin::Nonce&& other) {
+ token_ = other.token_; // Raw copy: no lazy-generation is triggered.
+ other.token_ = gurl_base::UnguessableToken(); // Reset |other|. NOTE(review): a self-move would leave |token_| empty.
+ return *this;
+}
+
+bool Origin::Nonce::operator<(const Origin::Nonce& other) const {
+  // An ordering needs concrete values, so both tokens get generated here.
+  const gurl_base::UnguessableToken& own_token = token();
+  return own_token < other.token();
+}
+
+bool Origin::Nonce::operator==(const Origin::Nonce& other) const {
+  // No token needs to be generated to answer this. Two still-empty nonces
+  // stand for two distinct future values, so they are equal only when they
+  // are literally the same object.
+  return (other.token_ == token_) && (!token_.is_empty() || &other == this);
+}
+
+bool Origin::Nonce::operator!=(const Origin::Nonce& other) const {
+  return !operator==(other);  // Exact negation of operator==.
+}
+
+} // namespace url
diff --git a/url/origin.h b/url/origin.h
new file mode 100644
index 0000000..58c9221
--- /dev/null
+++ b/url/origin.h
@@ -0,0 +1,393 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_ORIGIN_H_
+#define URL_ORIGIN_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/debug/alias.h"
+#include "base/optional.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
+#include "base/unguessable_token.h"
+#include "ipc/ipc_param_traits.h"
+#include "url/scheme_host_port.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_constants.h"
+
+class GURL;
+
+namespace blink {
+class SecurityOrigin;
+} // namespace blink
+
+namespace ipc_fuzzer {
+template <class T>
+struct FuzzTraits;
+} // namespace ipc_fuzzer
+
+namespace mojo {
+template <typename DataViewType, typename T>
+struct StructTraits;
+struct UrlOriginAdapter;
+} // namespace mojo
+
+namespace url {
+
+namespace mojom {
+class OriginDataView;
+} // namespace mojom
+
+// Per https://html.spec.whatwg.org/multipage/origin.html#origin, an origin is
+// either:
+// - a tuple origin of (scheme, host, port) as described in RFC 6454.
+// - an opaque origin with an internal value, and a memory of the tuple origin
+// from which it was derived.
+//
+// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'.
+// If you only need to extract the bits of a URL which are relevant for a
+// network connection, use 'url::SchemeHostPort'.
+//
+// STL;SDR: If you aren't making actual network connections, use 'url::Origin'.
+//
+// This class ought to be used when code needs to determine if two resources
+// are "same-origin", and when a canonical serialization of an origin is
+// required. Note that the canonical serialization of an origin *must not* be
+// used to determine if two resources are same-origin.
+//
+// A tuple origin, like 'SchemeHostPort', is composed of a tuple of (scheme,
+// host, port), but contains a number of additional concepts which make it
+// appropriate for use as a security boundary and access control mechanism
+// between contexts. Two tuple origins are same-origin if the tuples are equal.
+// A tuple origin may also be re-created from its serialization.
+//
+// An opaque origin has an internal globally unique identifier. When creating a
+// new opaque origin from a URL, a fresh globally unique identifier is
+// generated. However, if an opaque origin is copied or moved, the internal
+// globally unique identifier is preserved. Two opaque origins are same-origin
+// iff the globally unique identifiers match. Unlike tuple origins, an opaque
+// origin cannot be re-created from its serialization, which is always the
+// string "null".
+//
+// IMPORTANT: Since opaque origins always serialize as the string "null", it is
+// *never* safe to use the serialization for security checks!
+//
+// A tuple origin and an opaque origin are never same-origin.
+//
+// There are a few subtleties to note:
+//
+// * A default constructed Origin is opaque, with no precursor origin.
+//
+// * Invalid and non-standard GURLs are parsed as opaque origins. This includes
+// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'.
+//
+// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the
+// internals of the URL. That is, 'filesystem:https://example.com/temporary/f'
+// is parsed as ('https', 'example.com', 443).
+//
+// * GURLs with a 'file' scheme are tricky. They are parsed as ('file', '', 0),
+// but their behavior may differ from embedder to embedder.
+// TODO(dcheng): This behavior is not consistent with Blink's notion of file
+// URLs, which always creates an opaque origin.
+//
+// * The host component of an IPv6 address includes brackets, just like the URL
+// representation.
+//
+// Usage:
+//
+// * Origins are generally constructed from an already-canonicalized GURL:
+//
+// GURL url("https://example.com/");
+// url::Origin origin = Origin::Create(url);
+// origin.scheme(); // "https"
+// origin.host(); // "example.com"
+// origin.port(); // 443
+// origin.opaque(); // false
+//
+// * To answer the question "Are |this| and |that| "same-origin" with each
+// other?", use |Origin::IsSameOriginWith|:
+//
+// if (this.IsSameOriginWith(that)) {
+// // Amazingness goes here.
+// }
+class COMPONENT_EXPORT(URL) Origin {
+ public:
+ // Creates an opaque Origin with a nonce that is different from all previously
+ // existing origins.
+ Origin();
+
+ // Creates an Origin from |url|, as described at
+ // https://url.spec.whatwg.org/#origin, with the following additions:
+ //
+ // 1. If |url| is invalid or non-standard, an opaque Origin is constructed.
+ // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed
+ // out of everything in the URL which follows the scheme).
+ // 3. 'file' URLs all parse as ("file", "", 0).
+ static Origin Create(const GURL& url);
+
+ // Creates an Origin for the resource |url| as if it were requested
+ // from the context of |base_origin|. If |url| is standard
+ // (in the sense that it embeds a complete origin, like http/https),
+ // this returns the same value as would Create().
+ //
+ // If |url| is "about:blank", this returns a copy of |base_origin|.
+ //
+ // Otherwise, returns a new opaque origin derived from |base_origin|.
+ // In this case, the resulting opaque origin will inherit the tuple
+ // (or precursor tuple) of |base_origin|, but will not be same origin
+ // with |base_origin|, even if |base_origin| is already opaque.
+ static Origin Resolve(const GURL& url, const Origin& base_origin);
+
+ // Copyable and movable.
+ Origin(const Origin&);
+ Origin& operator=(const Origin&);
+ Origin(Origin&&);
+ Origin& operator=(Origin&&);
+
+ // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters
+ // must be valid and canonicalized. Returns nullopt if any parameter is not
+ // canonical, or if all the parameters are empty.
+ //
+ // This factory method should be used in order to pass 'Origin' objects back
+ // and forth over IPC (as transitioning through GURL would risk potentially
+ // dangerous recanonicalization); other potential callers should prefer the
+ // 'GURL'-based Origin::Create().
+ static gurl_base::Optional<Origin> UnsafelyCreateTupleOriginWithoutNormalization(
+ gurl_base::StringPiece scheme,
+ gurl_base::StringPiece host,
+ uint16_t port);
+
+ // Creates an origin without sanity checking that the host is canonicalized.
+ // This should only be used when converting between already normalized types,
+ // and should NOT be used for IPC. Method takes std::strings for use with move
+ // operators to avoid copies.
+ static Origin CreateFromNormalizedTuple(std::string scheme,
+ std::string host,
+ uint16_t port);
+
+ ~Origin();
+
+ // For opaque origins, these return ("", "", 0).
+ const std::string& scheme() const {
+ return !opaque() ? tuple_.scheme() : gurl_base::EmptyString();
+ }
+ const std::string& host() const {
+ return !opaque() ? tuple_.host() : gurl_base::EmptyString();
+ }
+ uint16_t port() const { return !opaque() ? tuple_.port() : 0; }
+
+ bool opaque() const { return nonce_.has_value(); }
+
+ // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with
+ // the addition that all Origins with a 'file' scheme serialize to "file://".
+ std::string Serialize() const;
+
+ // Two non-opaque Origins are "same-origin" if their schemes, hosts, and ports
+ // are exact matches. Two opaque origins are same-origin only if their
+ // internal nonce values match. A non-opaque origin is never same-origin with
+ // an opaque origin.
+ bool IsSameOriginWith(const Origin& other) const;
+ bool operator==(const Origin& other) const { return IsSameOriginWith(other); }
+ bool operator!=(const Origin& other) const {
+ return !IsSameOriginWith(other);
+ }
+
+ // This method returns true for any |url| which if navigated to could result
+ // in an origin compatible with |this|.
+ bool CanBeDerivedFrom(const GURL& url) const;
+
+ // Get the scheme, host, and port from which this origin derives. For
+ // a tuple Origin, this gives the same values as calling scheme(), host()
+ // and port(). For an opaque Origin that was created by calling
+ // Origin::DeriveNewOpaqueOrigin() on a precursor or Origin::Resolve(),
+ // this returns the tuple inherited from the precursor.
+ //
+ // If this Origin is opaque and was created via the default constructor or
+ // Origin::Create(), the precursor is unknown and the tuple is invalid.
+ //
+ // Use with great caution: opaque origins should generally not inherit
+ // privileges from the origins they derive from. However, in some cases
+ // (such as restrictions on process placement, or determining the http lock
+ // icon) this information may be relevant to ensure that entering an
+ // opaque origin does not grant privileges initially denied to the original
+ // non-opaque origin.
+ //
+ // This method has a deliberately obnoxious name to prompt caution in its use.
+ const SchemeHostPort& GetTupleOrPrecursorTupleIfOpaque() const {
+ return tuple_;
+ }
+
+ // Efficiently returns what GURL(Serialize()) would without re-parsing the
+ // URL. This can be used for the (rare) times a GURL representation is needed
+ // for an Origin.
+ // Note: The returned URL will not necessarily be serialized to the same value
+ // as the Origin would. The GURL will have an added "/" path for Origins with
+ // valid SchemeHostPorts and file Origins.
+ //
+ // Try not to use this method under normal circumstances, as it loses type
+ // information. Downstream consumers can mistake the returned GURL with a full
+ // URL (e.g. with a path component).
+ GURL GetURL() const;
+
+ // Same as GURL::DomainIs. If |this| origin is opaque, then returns false.
+ bool DomainIs(gurl_base::StringPiece canonical_domain) const;
+
+ // Allows Origin to be used as a key in STL (for example, a std::set or
+ // std::map).
+ bool operator<(const Origin& other) const;
+
+ // Creates a new opaque origin that is guaranteed to be cross-origin to all
+ // currently existing origins. An origin created by this method retains its
+ // identity across copies. Copies are guaranteed to be same-origin to each
+ // other, e.g.
+ //
+ // url::Origin page = Origin::Create(GURL("http://example.com"))
+ // url::Origin a = page.DeriveNewOpaqueOrigin();
+ // url::Origin b = page.DeriveNewOpaqueOrigin();
+ // url::Origin c = a;
+ // url::Origin d = b;
+ //
+ // |a| and |c| are same-origin, since |c| was copied from |a|. |b| and |d| are
+ // same-origin as well, since |d| was copied from |b|. All other combinations
+ // of origins are considered cross-origin, e.g. |a| is cross-origin to |b| and
+ // |d|, |b| is cross-origin to |a| and |c|, |c| is cross-origin to |b| and
+ // |d|, and |d| is cross-origin to |a| and |c|.
+ Origin DeriveNewOpaqueOrigin() const;
+
+ // Creates a string representation of the object that can be used for logging
+ // and debugging. It serializes the internal state, such as the nonce value
+ // and precursor information.
+ std::string GetDebugString() const;
+
+ private:
+ friend class blink::SecurityOrigin;
+ friend class OriginTest;
+ friend struct mojo::UrlOriginAdapter;
+ friend struct ipc_fuzzer::FuzzTraits<Origin>;
+ friend struct mojo::StructTraits<url::mojom::OriginDataView, url::Origin>;
+ friend IPC::ParamTraits<url::Origin>;
+ friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out,
+ const Origin& origin);
+
+ // Origin::Nonce is a wrapper around gurl_base::UnguessableToken that generates
+ // the random value only when the value is first accessed. The lazy generation
+ // allows Origin to be default-constructed quickly, without spending time
+ // in random number generation.
+ //
+ // TODO(nick): Should this optimization move into UnguessableToken, once it no
+ // longer treats the Null case specially?
+ class COMPONENT_EXPORT(URL) Nonce {
+ public:
+ // Creates a nonce to hold a newly-generated UnguessableToken. The actual
+ // token value will be generated lazily.
+ Nonce();
+
+ // Creates a nonce to hold an already-generated UnguessableToken value. This
+ // constructor should only be used for IPC serialization and testing --
+ // regular code should never need to touch the UnguessableTokens directly,
+ // and the default constructor is faster.
+ explicit Nonce(const gurl_base::UnguessableToken& token);
+
+ // Accessor, which lazily initializes the underlying |token_| member.
+ const gurl_base::UnguessableToken& token() const;
+
+ // Do not use in cases where lazy initialization is expected! This
+ // accessor does not initialize the |token_| member.
+ const gurl_base::UnguessableToken& raw_token() const;
+
+ // Copyable and movable. Copying a Nonce triggers lazy-initialization,
+ // moving it does not.
+ Nonce(const Nonce&);
+ Nonce& operator=(const Nonce&);
+ Nonce(Nonce&&);
+ Nonce& operator=(Nonce&&);
+
+ // Note that operator<, used by map-type containers, will trigger |token_|
+ // lazy-initialization. Equality comparisons do not.
+ bool operator<(const Nonce& other) const;
+ bool operator==(const Nonce& other) const;
+ bool operator!=(const Nonce& other) const;
+
+ private:
+ friend class OriginTest;
+
+ // mutable to support lazy generation.
+ mutable gurl_base::UnguessableToken token_;
+ };
+
+ // This needs to be friended within Origin as well, since Nonce is a private
+ // nested class of Origin.
+ friend COMPONENT_EXPORT(URL) std::ostream& operator<<(std::ostream& out,
+ const Nonce& nonce);
+
+ // Creates an opaque origin with the identity given by |nonce| and the given
+ // precursor tuple, without sanity checking that the precursor host is
+ // canonicalized. This should only be used when converting between already
+ // normalized types, and should NOT be used for IPC.
+ static Origin CreateOpaqueFromNormalizedPrecursorTuple(
+ std::string precursor_scheme,
+ std::string precursor_host,
+ uint16_t precursor_port,
+ const Nonce& nonce);
+
+ // Creates an opaque Origin with the identity given by |nonce|, and an
+ // optional precursor origin given by |precursor_scheme|, |precursor_host| and
+ // |precursor_port|. Returns nullopt if any parameter is not canonical. When
+ // the precursor is unknown, the precursor parameters should be ("", "", 0).
+ //
+ // This factory method should be used in order to pass opaque Origin objects
+ // back and forth over IPC (as transitioning through GURL would risk
+ // potentially dangerous recanonicalization).
+ static gurl_base::Optional<Origin> UnsafelyCreateOpaqueOriginWithoutNormalization(
+ gurl_base::StringPiece precursor_scheme,
+ gurl_base::StringPiece precursor_host,
+ uint16_t precursor_port,
+ const Nonce& nonce);
+
+ // Constructs a non-opaque tuple origin. |tuple| must be valid.
+ explicit Origin(SchemeHostPort tuple);
+
+ // Constructs an opaque origin derived from the |precursor| tuple, with the
+ // given |nonce|.
+ Origin(const Nonce& nonce, SchemeHostPort precursor);
+
+ // Get the nonce associated with this origin, if it is opaque. This should be
+ // used only when trying to send an Origin across an IPC pipe.
+ gurl_base::Optional<gurl_base::UnguessableToken> GetNonceForSerialization() const;
+
+ // The tuple is used for both tuple origins (e.g. https://example.com:80), as
+ // well as for opaque origins, where it tracks the tuple origin from which
+ // the opaque origin was initially derived (we call this the "precursor"
+ // origin).
+ SchemeHostPort tuple_;
+
+ // The nonce is used for maintaining identity of an opaque origin. This
+ // nonce is preserved when an opaque origin is copied or moved. An Origin
+ // is considered opaque if and only if |nonce_| holds a value.
+ gurl_base::Optional<Nonce> nonce_;
+};
+
+// Pretty-printers for logging. These expose the internal state of the nonce.
+COMPONENT_EXPORT(URL)
+std::ostream& operator<<(std::ostream& out, const Origin& origin);
+COMPONENT_EXPORT(URL)
+std::ostream& operator<<(std::ostream& out, const Origin::Nonce& nonce);
+
+COMPONENT_EXPORT(URL) bool IsSameOriginWith(const GURL& a, const GURL& b);
+
+// DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) copies the serialization of |origin|
+// into a new stack-allocated variable named |<var_name>|. This helps ensure
+// that the value of |origin| gets preserved in crash dumps.
+#define DEBUG_ALIAS_FOR_ORIGIN(var_name, origin) \
+ DEBUG_ALIAS_FOR_CSTR(var_name, (origin).Serialize().c_str(), 128)
+
+} // namespace url
+
+#endif // URL_ORIGIN_H_
diff --git a/url/origin_unittest.cc b/url/origin_unittest.cc
new file mode 100644
index 0000000..2754f23
--- /dev/null
+++ b/url/origin_unittest.cc
@@ -0,0 +1,866 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "polyfills/base/logging.h"
+#include "base/macros.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/origin.h"
+#include "url/url_util.h"
+
+namespace url {
+
+void ExpectParsedUrlsEqual(const GURL& a, const GURL& b) { // Expects equal URLs with identical parsed components.
+ EXPECT_EQ(a, b);
+ const Parsed& a_parsed = a.parsed_for_possibly_invalid_spec();
+ const Parsed& b_parsed = b.parsed_for_possibly_invalid_spec();
+ EXPECT_EQ(a_parsed.scheme.begin, b_parsed.scheme.begin); // Each component: same begin and len.
+ EXPECT_EQ(a_parsed.scheme.len, b_parsed.scheme.len);
+ EXPECT_EQ(a_parsed.username.begin, b_parsed.username.begin);
+ EXPECT_EQ(a_parsed.username.len, b_parsed.username.len);
+ EXPECT_EQ(a_parsed.password.begin, b_parsed.password.begin);
+ EXPECT_EQ(a_parsed.password.len, b_parsed.password.len);
+ EXPECT_EQ(a_parsed.host.begin, b_parsed.host.begin);
+ EXPECT_EQ(a_parsed.host.len, b_parsed.host.len);
+ EXPECT_EQ(a_parsed.port.begin, b_parsed.port.begin);
+ EXPECT_EQ(a_parsed.port.len, b_parsed.port.len);
+ EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin);
+ EXPECT_EQ(a_parsed.path.len, b_parsed.path.len);
+ EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin);
+ EXPECT_EQ(a_parsed.query.len, b_parsed.query.len);
+ EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin);
+ EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len);
+}
+
+class OriginTest : public ::testing::Test {
+ public:
+ void SetUp() override {
+ // Add two schemes which are local but nonstandard.
+ AddLocalScheme("local-but-nonstandard");
+ AddLocalScheme("also-local-but-nonstandard");
+
+ // Add a scheme that's both local and standard.
+ AddStandardScheme("local-and-standard", SchemeType::SCHEME_WITH_HOST);
+ AddLocalScheme("local-and-standard");
+
+ // Add a scheme that's standard but no-access. We still want these to
+ // form valid SchemeHostPorts, even though they always commit as opaque
+ // origins, so that they can represent the source of the resource even if
+ // it's not committable as a non-opaque origin.
+ AddStandardScheme("standard-but-noaccess", SchemeType::SCHEME_WITH_HOST);
+ AddNoAccessScheme("standard-but-noaccess");
+ }
+ void TearDown() override { url::ResetForTests(); } // Presumably undoes the scheme registrations made in SetUp().
+
+ ::testing::AssertionResult DoEqualityComparisons(const url::Origin& a,
+ const url::Origin& b,
+ bool should_compare_equal) {
+ ::testing::AssertionResult failure = ::testing::AssertionFailure(); // Built up front; returned by the first failing check.
+ failure << "DoEqualityComparisons failure. Expecting "
+ << (should_compare_equal ? "equality" : "inequality")
+ << " between:\n a\n Which is: " << a
+ << "\n b\n Which is: " << b << "\nThe following check failed: ";
+ if (a.IsSameOriginWith(b) != should_compare_equal)
+ return failure << "a.IsSameOriginWith(b)";
+ if (b.IsSameOriginWith(a) != should_compare_equal)
+ return failure << "b.IsSameOriginWith(a)";
+ if ((a == b) != should_compare_equal)
+ return failure << "(a == b)";
+ if ((b == a) != should_compare_equal)
+ return failure << "(b == a)";
+ if ((b != a) != !should_compare_equal)
+ return failure << "(b != a)";
+ if ((a != b) != !should_compare_equal)
+ return failure << "(a != b)";
+ return ::testing::AssertionSuccess();
+ }
+
+ bool HasNonceTokenBeenInitialized(const url::Origin& origin) {
+ EXPECT_TRUE(origin.opaque());
+ // Avoid calling nonce_.token() here, to not trigger lazy initialization.
+ return !origin.nonce_->token_.is_empty();
+ }
+
+ Origin::Nonce CreateNonce() { return Origin::Nonce(); } // Token not generated yet.
+
+ Origin::Nonce CreateNonce(gurl_base::UnguessableToken nonce) {
+ return Origin::Nonce(nonce); // Wraps an already-generated token.
+ }
+
+ gurl_base::Optional<gurl_base::UnguessableToken> GetNonce(const Origin& origin) {
+ return origin.GetNonceForSerialization(); // Exposes the private accessor to tests.
+ }
+
+ // Wrapper around url::Origin method to expose it to tests.
+ gurl_base::Optional<Origin> UnsafelyCreateOpaqueOriginWithoutNormalization(
+ gurl_base::StringPiece precursor_scheme,
+ gurl_base::StringPiece precursor_host,
+ uint16_t precursor_port,
+ const Origin::Nonce& nonce) {
+ return Origin::UnsafelyCreateOpaqueOriginWithoutNormalization(
+ precursor_scheme, precursor_host, precursor_port, nonce);
+ }
+};
+
+TEST_F(OriginTest, OpaqueOriginComparison) {
+ // A default-constructed Origin should be cross origin to everything
+ // but itself.
+ url::Origin opaque_a, opaque_b;
+ EXPECT_TRUE(opaque_a.opaque());
+ EXPECT_EQ("", opaque_a.scheme());
+ EXPECT_EQ("", opaque_a.host());
+ EXPECT_EQ(0, opaque_a.port());
+ EXPECT_EQ(SchemeHostPort(), opaque_a.GetTupleOrPrecursorTupleIfOpaque());
+ EXPECT_TRUE(opaque_a.GetTupleOrPrecursorTupleIfOpaque().IsInvalid());
+
+ EXPECT_TRUE(opaque_b.opaque());
+ EXPECT_EQ("", opaque_b.scheme());
+ EXPECT_EQ("", opaque_b.host());
+ EXPECT_EQ(0, opaque_b.port());
+ EXPECT_EQ(SchemeHostPort(), opaque_b.GetTupleOrPrecursorTupleIfOpaque());
+ EXPECT_TRUE(opaque_b.GetTupleOrPrecursorTupleIfOpaque().IsInvalid());
+
+ // Two default-constructed Origins should always be cross origin to each
+ // other.
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true));
+
+ // The streaming operator should not trigger lazy initialization of the token.
+ std::ostringstream stream;
+ stream << opaque_a;
+ EXPECT_STREQ("null [internally: (nonce TBD) anonymous]",
+ stream.str().c_str());
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a));
+
+ // None of the operations thus far should have triggered lazy-generation of
+ // the UnguessableToken. Copying an origin, however, should trigger this.
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a));
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_b));
+ opaque_b = opaque_a;
+
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a));
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, true));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true));
+
+ // Move-assigning from a fresh Origin should restore the lazy initialization.
+ opaque_a = url::Origin();
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a));
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true));
+
+ // Ordering (operator<) two opaque Origins with matching SchemeHostPorts
+ // should trigger lazy initialization.
+ EXPECT_FALSE(HasNonceTokenBeenInitialized(opaque_a));
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b));
+ bool should_swap = opaque_b < opaque_a;
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_a));
+ EXPECT_TRUE(HasNonceTokenBeenInitialized(opaque_b));
+
+ if (should_swap)
+ std::swap(opaque_a, opaque_b);
+ EXPECT_LT(opaque_a, opaque_b);
+ EXPECT_FALSE(opaque_b < opaque_a);
+
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_b, false));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_b, opaque_b, true));
+ EXPECT_TRUE(DoEqualityComparisons(opaque_a, opaque_a, true));
+
+ EXPECT_LT(opaque_a, url::Origin::Create(GURL("http://www.google.com")));
+ EXPECT_LT(opaque_b, url::Origin::Create(GURL("http://www.google.com")));
+
+ EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:blank"), opaque_b));
+ EXPECT_EQ(opaque_b, url::Origin::Resolve(GURL("about:srcdoc"), opaque_b));
+ EXPECT_EQ(opaque_b,
+ url::Origin::Resolve(GURL("about:blank?hello#whee"), opaque_b));
+
+ const char* const urls[] = {
+ "data:text/html,Hello!",
+ "javascript:alert(1)",
+ "about:blank",
+ "file://example.com:443/etc/passwd",
+ "unknown-scheme:foo",
+ "unknown-scheme://bar",
+ "http",
+ "http:",
+ "http:/",
+ "http://",
+ "http://:",
+ "http://:1",
+ "yay",
+ "http::///invalid.example.com/",
+ "blob:null/foo", // blob:null (actually a valid URL)
+ "blob:data:foo", // blob + data (which is nonstandard)
+ "blob:about://blank/", // blob + about (which is nonstandard)
+ "blob:about:blank/", // blob + about (which is nonstandard)
+ "filesystem:http://example.com/", // Invalid (missing /type/)
+ "filesystem:local-but-nonstandard:baz./type/", // fs requires standard
+ "filesystem:local-but-nonstandard://hostname/type/",
+ "filesystem:unknown-scheme://hostname/type/",
+ "local-but-nonstandar:foo", // Prefix of registered scheme.
+ "but-nonstandard:foo", // Suffix of registered scheme.
+ "local-and-standard:", // Standard scheme needs a hostname.
+ "standard-but-noaccess:", // Standard scheme needs a hostname.
+ "blob:blob:http://www.example.com/guid-goes-here", // Double blob.
+ };
+
+ for (auto* test_url : urls) {
+ SCOPED_TRACE(test_url);
+ GURL url(test_url);
+ const url::Origin opaque_origin;
+
+ // Every URL in the list must yield an opaque origin from Origin::Create().
+ {
+ Origin origin = Origin::Create(url);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.opaque());
+ // An origin is always same-origin with itself.
+ EXPECT_EQ(origin, origin);
+ EXPECT_NE(origin, url::Origin());
+ EXPECT_EQ(SchemeHostPort(), origin.GetTupleOrPrecursorTupleIfOpaque());
+ // A copy of |origin| should be same-origin as well.
+ Origin origin_copy = origin;
+ EXPECT_EQ("", origin_copy.scheme());
+ EXPECT_EQ("", origin_copy.host());
+ EXPECT_EQ(0, origin_copy.port());
+ EXPECT_TRUE(origin_copy.opaque());
+ EXPECT_EQ(origin, origin_copy);
+ // And it should always be cross-origin to another opaque Origin.
+ EXPECT_NE(origin, opaque_origin);
+ // Re-creating from the URL should also be cross-origin.
+ EXPECT_NE(origin, Origin::Create(url));
+
+ ExpectParsedUrlsEqual(GURL(origin.Serialize()), origin.GetURL());
+ }
+ }
+}
+
+TEST_F(OriginTest, ConstructFromTuple) {
+  // Already-normalized (scheme, host, port) tuples to round-trip.
+  const struct {
+    const char* scheme;
+    const char* host;
+    uint16_t port;
+  } kTuples[] = {
+      {"http", "example.com", 80},
+      {"http", "example.com", 123},
+      {"https", "example.com", 443},
+  };
+
+  for (const auto& tuple : kTuples) {
+    SCOPED_TRACE(testing::Message() << tuple.scheme << "://" << tuple.host
+                                    << ":" << tuple.port);
+    const Origin created = Origin::CreateFromNormalizedTuple(
+        tuple.scheme, tuple.host, tuple.port);
+
+    // Each accessor must report exactly the component that was supplied.
+    EXPECT_EQ(tuple.scheme, created.scheme());
+    EXPECT_EQ(tuple.host, created.host());
+    EXPECT_EQ(tuple.port, created.port());
+  }
+}
+
+TEST_F(OriginTest, ConstructFromGURL) {  // Valid URLs -> non-opaque origins.
+ Origin different_origin =
+ Origin::Create(GURL("https://not-in-the-list.test/"));
+
+ struct TestCases {
+ const char* const url;
+ const char* const expected_scheme;
+ const char* const expected_host;
+ const uint16_t expected_port;
+ } cases[] = {
+ // IP addresses: numeric hosts are expected in dotted-quad IPv4 form.
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+ {"http://1/", "http", "0.0.0.1", 80},
+ {"http://1:1/", "http", "0.0.0.1", 1},
+ {"http://3232237825/", "http", "192.168.9.1", 80},
+
+ // Punycode: non-ASCII hosts are expected in their "xn--" form.
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"blob:http://☃.net/", "http", "xn--n3h.net", 80},
+
+ // Generic URLs: userinfo, query and fragment do not affect the tuple.
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"http://user:pass@example.com/", "http", "example.com", 80},
+ {"http://example.com:123/?query", "http", "example.com", 123},
+ {"https://example.com/#1234", "https", "example.com", 443},
+ {"https://u:p@example.com:123/?query#1234", "https", "example.com", 123},
+
+ // Other schemes; no port is given in the URL, note the expected ports.
+ {"ftp://example.com/", "ftp", "example.com", 21},
+ {"gopher://example.com/", "gopher", "example.com", 70},
+ {"ws://example.com/", "ws", "example.com", 80},
+ {"wss://example.com/", "wss", "example.com", 443},
+ {"wss://user:pass@example.com/", "wss", "example.com", 443},
+
+ // Scheme (registered in SetUp()) that's both local and standard.
+ // TODO: Is it really appropriate to do network-host canonicalization of
+ // schemes without ports?
+ {"local-and-standard:20", "local-and-standard", "0.0.0.20", 0},
+ {"local-and-standard:20.", "local-and-standard", "0.0.0.20", 0},
+ {"local-and-standard:↑↑↓↓←→←→ba.↑↑↓↓←→←→ba.0.bg", "local-and-standard",
+ "xn--ba-rzuadaibfa.xn--ba-rzuadaibfa.0.bg", 0},
+ {"local-and-standard:foo", "local-and-standard", "foo", 0},
+ {"local-and-standard://bar:20", "local-and-standard", "bar", 0},
+ {"local-and-standard:baz.", "local-and-standard", "baz.", 0},
+ {"local-and-standard:baz..", "local-and-standard", "baz..", 0},
+ {"local-and-standard:baz..bar", "local-and-standard", "baz..bar", 0},
+ {"local-and-standard:baz...", "local-and-standard", "baz...", 0},
+
+ // Scheme (registered in SetUp()) that's local but nonstandard. These
+ // always have empty hostnames, but are allowed to be url::Origins.
+ {"local-but-nonstandard:", "local-but-nonstandard", "", 0},
+ {"local-but-nonstandard:foo", "local-but-nonstandard", "", 0},
+ {"local-but-nonstandard://bar", "local-but-nonstandard", "", 0},
+ {"also-local-but-nonstandard://bar", "also-local-but-nonstandard", "", 0},
+
+ // Scheme (registered in SetUp()) that's standard but marked as noaccess.
+ // url::Origin doesn't currently take the noaccess property into account,
+ // so these aren't expected to result in opaque origins.
+ {"standard-but-noaccess:foo", "standard-but-noaccess", "foo", 0},
+ {"standard-but-noaccess://bar", "standard-but-noaccess", "bar", 0},
+
+ // file: URLs: the expected port is 0 and the host may be empty.
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+
+ // Filesystem: the expected tuple is that of the inner URL.
+ {"filesystem:http://example.com/type/", "http", "example.com", 80},
+ {"filesystem:http://example.com:123/type/", "http", "example.com", 123},
+ {"filesystem:https://example.com/type/", "https", "example.com", 443},
+ {"filesystem:https://example.com:123/type/", "https", "example.com", 123},
+ {"filesystem:local-and-standard:baz./type/", "local-and-standard", "baz.",
+ 0},
+
+ // Blob: the expected tuple is that of the inner URL.
+ {"blob:http://example.com/guid-goes-here", "http", "example.com", 80},
+ {"blob:http://example.com:123/guid-goes-here", "http", "example.com",
+ 123},
+ {"blob:https://example.com/guid-goes-here", "https", "example.com", 443},
+ {"blob:http://u:p@example.com/guid-goes-here", "http", "example.com", 80},
+
+ // Gopher: the host is expected in lowercase.
+ {"gopher://8u.9.Vx6", "gopher", "8u.9.vx6", 70},
+ };
+
+ for (const auto& test_case : cases) {
+ SCOPED_TRACE(test_case.url);
+ GURL url(test_case.url);
+ EXPECT_TRUE(url.is_valid());
+ Origin origin = Origin::Create(url);
+ EXPECT_EQ(test_case.expected_scheme, origin.scheme());
+ EXPECT_EQ(test_case.expected_host, origin.host());
+ EXPECT_EQ(test_case.expected_port, origin.port());
+ EXPECT_FALSE(origin.opaque());
+ EXPECT_EQ(origin, origin);
+ EXPECT_NE(different_origin, origin);
+ EXPECT_NE(origin, different_origin);
+ EXPECT_EQ(origin, Origin::Resolve(GURL("about:blank"), origin));
+ EXPECT_EQ(origin, Origin::Resolve(GURL("about:blank?bar#foo"), origin));
+
+ ExpectParsedUrlsEqual(GURL(origin.Serialize()), origin.GetURL());
+
+ url::Origin derived_opaque =
+ Origin::Resolve(GURL("about:blank?bar#foo"), origin)
+ .DeriveNewOpaqueOrigin();
+ EXPECT_TRUE(derived_opaque.opaque());
+ EXPECT_NE(origin, derived_opaque);
+ EXPECT_FALSE(derived_opaque.GetTupleOrPrecursorTupleIfOpaque().IsInvalid());
+ EXPECT_EQ(origin.GetTupleOrPrecursorTupleIfOpaque(),
+ derived_opaque.GetTupleOrPrecursorTupleIfOpaque());
+ EXPECT_EQ(derived_opaque, derived_opaque);
+
+ url::Origin derived_opaque_via_data_url =
+ Origin::Resolve(GURL("data:text/html,baz"), origin);
+ EXPECT_TRUE(derived_opaque_via_data_url.opaque());
+ EXPECT_NE(origin, derived_opaque_via_data_url);
+ EXPECT_FALSE(derived_opaque_via_data_url.GetTupleOrPrecursorTupleIfOpaque()
+ .IsInvalid());
+ EXPECT_EQ(origin.GetTupleOrPrecursorTupleIfOpaque(),
+ derived_opaque_via_data_url.GetTupleOrPrecursorTupleIfOpaque());
+ EXPECT_NE(derived_opaque, derived_opaque_via_data_url);
+ EXPECT_NE(derived_opaque_via_data_url, derived_opaque);
+ EXPECT_NE(derived_opaque.DeriveNewOpaqueOrigin(), derived_opaque);
+ EXPECT_EQ(derived_opaque_via_data_url, derived_opaque_via_data_url);
+ }
+}
+
+TEST_F(OriginTest, Serialization) {
+  const struct {
+    const char* url;
+    const char* expected;
+    const char* expected_log;  // Left null when it equals |expected|.
+  } kCases[] = {
+      {"http://192.168.9.1/", "http://192.168.9.1"},
+      {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+      {"http://☃.net/", "http://xn--n3h.net"},
+      {"http://example.com/", "http://example.com"},
+      {"http://example.com:123/", "http://example.com:123"},
+      {"https://example.com/", "https://example.com"},
+      {"https://example.com:123/", "https://example.com:123"},
+      {"file:///etc/passwd", "file://", "file:// [internally: file://]"},
+      {"file://example.com/etc/passwd", "file://",
+       "file:// [internally: file://example.com]"},
+      {"data:,", "null", "null [internally: (nonce TBD) anonymous]"},
+  };
+
+  for (const auto& entry : kCases) {
+    SCOPED_TRACE(entry.url);
+    const GURL parsed_url(entry.url);
+    EXPECT_TRUE(parsed_url.is_valid());
+    const Origin origin = Origin::Create(parsed_url);
+    const std::string serialized = origin.Serialize();
+    ExpectParsedUrlsEqual(GURL(serialized), origin.GetURL());
+
+    EXPECT_EQ(entry.expected, serialized);
+
+    // operator<< may append internal details; rows without an
+    // |expected_log| must log exactly the plain serialization.
+    std::stringstream logged;
+    logged << origin;
+    const char* const expected_log =
+        entry.expected_log ? entry.expected_log : entry.expected;
+    EXPECT_EQ(expected_log, logged.str());
+  }
+}
+
+TEST_F(OriginTest, Comparison) {
+  // Listed in increasing order, as checked with operator< below.
+  const char* const kSortedUrls[] = {
+      "data:uniqueness", "http://a:80", "http://b:80",
+      "https://a:80", "https://b:80", "http://a:81",
+      "http://b:81", "https://a:81", "https://b:81",
+  };
+  // The ordering has to hold for canonical origins, including the case
+  // where a created opaque origin contains a nonce.
+  {
+    // Build every origin up front: a freshly-constructed (not copied) unique
+    // origin gets a different internal nonce each time.
+    std::vector<Origin> sorted;
+    for (const char* spec : kSortedUrls)
+      sorted.push_back(Origin::Create(GURL(spec)));
+    for (size_t lhs = 0; lhs < sorted.size(); lhs++) {
+      const Origin& left = sorted[lhs];
+      for (size_t rhs = lhs; rhs < sorted.size(); rhs++) {
+        const Origin& right = sorted[rhs];
+        EXPECT_EQ(lhs < rhs, left < right) << lhs << " < " << rhs;
+        EXPECT_EQ(rhs < lhs, right < left) << rhs << " < " << lhs;
+      }
+    }
+  }
+}
+
+TEST_F(OriginTest, UnsafelyCreate) {
+  const struct {
+    const char* scheme;
+    const char* host;
+    uint16_t port;
+  } kTuples[] = {
+      {"http", "example.com", 80},
+      {"http", "example.com", 123},
+      {"https", "example.com", 443},
+      {"https", "example.com", 123},
+      {"file", "", 0},
+      {"file", "example.com", 0},
+  };
+
+  for (const auto& tuple : kTuples) {
+    SCOPED_TRACE(testing::Message()
+                 << tuple.scheme << "://" << tuple.host << ":" << tuple.port);
+    gurl_base::Optional<url::Origin> created =
+        url::Origin::UnsafelyCreateTupleOriginWithoutNormalization(
+            tuple.scheme, tuple.host, tuple.port);
+    ASSERT_TRUE(created);
+    EXPECT_EQ(tuple.scheme, created->scheme());
+    EXPECT_EQ(tuple.host, created->host());
+    EXPECT_EQ(tuple.port, created->port());
+    EXPECT_FALSE(created->opaque());
+    EXPECT_TRUE(created->IsSameOriginWith(*created));
+
+    ExpectParsedUrlsEqual(GURL(created->Serialize()), created->GetURL());
+    // Opaque origins with this tuple as precursor, built from a fixed nonce.
+    gurl_base::UnguessableToken token = gurl_base::UnguessableToken::Create();
+    gurl_base::Optional<url::Origin> opaque =
+        UnsafelyCreateOpaqueOriginWithoutNormalization(
+            tuple.scheme, tuple.host, tuple.port, CreateNonce(token));
+    ASSERT_TRUE(opaque);
+    EXPECT_TRUE(opaque->opaque());
+    EXPECT_FALSE(*opaque == created);
+    EXPECT_EQ(opaque->GetTupleOrPrecursorTupleIfOpaque(),
+              created->GetTupleOrPrecursorTupleIfOpaque());
+    EXPECT_EQ(opaque,
+              UnsafelyCreateOpaqueOriginWithoutNormalization(
+                  tuple.scheme, tuple.host, tuple.port, CreateNonce(token)));
+    EXPECT_FALSE(*opaque == created->DeriveNewOpaqueOrigin());
+  }
+}
+
+TEST_F(OriginTest, UnsafelyCreateUniqueOnInvalidInput) {
+  url::AddStandardScheme("host-only", url::SCHEME_WITH_HOST);
+  url::AddStandardScheme("host-port-only", url::SCHEME_WITH_HOST_AND_PORT);
+  struct InvalidTuple {
+    const char* scheme;
+    const char* host;
+    uint16_t port = 80;
+  } invalid_tuples[] = {{"", "", 33},
+                        {"data", "", 0},
+                        {"blob", "", 0},
+                        {"filesystem", "", 0},
+                        {"data", "example.com"},
+                        {"http", "☃.net"},
+                        {"http\nmore", "example.com"},
+                        {"http\rmore", "example.com"},
+                        {"http\n", "example.com"},
+                        {"http\r", "example.com"},
+                        {"http", "example.com\nnot-example.com"},
+                        {"http", "example.com\rnot-example.com"},
+                        {"http", "example.com\n"},
+                        {"http", "example.com\r"},
+                        {"http", "example.com", 0},
+                        {"unknown-scheme", "example.com"},
+                        {"host-only", "\r", 0},
+                        {"host-only", "example.com", 22},
+                        {"host-port-only", "example.com", 0},
+                        {"file", ""}};
+
+  for (const auto& tuple : invalid_tuples) {
+    SCOPED_TRACE(testing::Message()
+                 << tuple.scheme << "://" << tuple.host << ":" << tuple.port);
+    EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization(
+        tuple.scheme, tuple.host, tuple.port, CreateNonce()));
+    EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization(
+        tuple.scheme, tuple.host, tuple.port));
+  }
+
+  // The all-empty scheme/host/port tuple cannot be a tuple origin.
+  EXPECT_FALSE(
+      url::Origin::UnsafelyCreateTupleOriginWithoutNormalization("", "", 0));
+
+  // It is accepted, though, as the unknown precursor of an opaque origin.
+  gurl_base::UnguessableToken token = gurl_base::UnguessableToken::Create();
+  gurl_base::Optional<url::Origin> precursorless =
+      UnsafelyCreateOpaqueOriginWithoutNormalization("", "", 0,
+                                                     CreateNonce(token));
+  ASSERT_TRUE(precursorless)
+      << "An invalid tuple is a valid input to "
+      << "UnsafelyCreateOpaqueOriginWithoutNormalization, so long as it is "
+      << "the canonical form of the invalid tuple.";
+  EXPECT_TRUE(precursorless->opaque());
+  EXPECT_EQ(GetNonce(precursorless.value()), token);
+  EXPECT_EQ(precursorless->GetTupleOrPrecursorTupleIfOpaque(),
+            url::SchemeHostPort());
+}
+
+TEST_F(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) {
+  struct NulTuple {
+    gurl_base::StringPiece scheme;  // Explicit lengths keep the embedded NULs.
+    gurl_base::StringPiece host;
+    uint16_t port = 80;
+  } nul_tuples[] = {{{"http\0more", 9}, {"example.com", 11}},
+                    {{"http\0", 5}, {"example.com", 11}},
+                    {{"\0http", 5}, {"example.com", 11}},
+                    {{"http"}, {"example.com\0not-example.com", 27}},
+                    {{"http"}, {"example.com\0", 12}},
+                    {{"http"}, {"\0example.com", 12}},
+                    {{""}, {"\0", 1}, 0},
+                    {{"\0", 1}, {""}, 0}};
+
+  for (const auto& tuple : nul_tuples) {
+    SCOPED_TRACE(testing::Message()
+                 << tuple.scheme << "://" << tuple.host << ":" << tuple.port);
+    EXPECT_FALSE(url::Origin::UnsafelyCreateTupleOriginWithoutNormalization(
+        tuple.scheme, tuple.host, tuple.port));
+    EXPECT_FALSE(UnsafelyCreateOpaqueOriginWithoutNormalization(
+        tuple.scheme, tuple.host, tuple.port, CreateNonce()));
+  }
+}
+
+TEST_F(OriginTest, DomainIs) {
+  const struct {
+    const char* url;
+    const char* lower_ascii_domain;
+    bool expected_domain_is;
+  } kDomainCases[] = {
+      {"http://google.com/foo", "google.com", true},
+      {"http://www.google.com:99/foo", "google.com", true},
+      {"http://www.google.com.cn/foo", "google.com", false},
+      {"http://www.google.comm", "google.com", false},
+      {"http://www.iamnotgoogle.com/foo", "google.com", false},
+      {"http://www.google.com/foo", "Google.com", false},
+
+      // A host with one trailing dot matches domains with or without a dot.
+      {"http://www.google.com./foo", "google.com", true},
+      {"http://www.google.com./foo", "google.com.", true},
+      {"http://www.google.com./foo", ".com", true},
+      {"http://www.google.com./foo", ".com.", true},
+
+      // The reverse does not hold: a dotted input domain does not match a
+      // host that lacks the trailing dot.
+      {"http://google.com/foo", "google.com.", false},
+
+      // A host ending in two dots doesn't match.
+      {"http://www.google.com../foo", "google.com", false},
+
+      // Filesystem scheme.
+      {"filesystem:http://www.google.com:99/foo/", "google.com", true},
+      {"filesystem:http://www.iamnotgoogle.com/foo/", "google.com", false},
+
+      // File scheme.
+      {"file:///home/user/text.txt", "", false},
+      {"file:///home/user/text.txt", "txt", false},
+  };
+
+  for (const auto& entry : kDomainCases) {
+    SCOPED_TRACE(testing::Message()
+                 << "(url, domain): (" << entry.url << ", "
+                 << entry.lower_ascii_domain << ")");
+    const GURL parsed(entry.url);
+    ASSERT_TRUE(parsed.is_valid());
+    Origin tuple_origin = Origin::Create(parsed);
+
+    EXPECT_EQ(entry.expected_domain_is,
+              tuple_origin.DomainIs(entry.lower_ascii_domain));
+    EXPECT_FALSE(tuple_origin.DeriveNewOpaqueOrigin().DomainIs(
+        entry.lower_ascii_domain));
+  }
+
+  // An origin created from an invalid URL never matches.
+  const GURL not_a_url("google.com");
+  ASSERT_FALSE(not_a_url.is_valid());
+  EXPECT_FALSE(Origin::Create(not_a_url).DomainIs("google.com"));
+
+  // Default-constructed (unique) origins never match either.
+  EXPECT_FALSE(Origin().DomainIs(""));
+  EXPECT_FALSE(Origin().DomainIs("com"));
+}
+
+TEST_F(OriginTest, DebugAlias) {
+  Origin aliased = Origin::Create(GURL("https://foo.com/bar"));
+  DEBUG_ALIAS_FOR_ORIGIN(alias_text, aliased);  // Declares |alias_text|.
+  EXPECT_STREQ("https://foo.com", alias_text);
+}
+
+TEST_F(OriginTest, NonStandardScheme) {
+  // Nothing is enabled for "cow" here, so the origin must come out opaque.
+  EXPECT_TRUE(Origin::Create(GURL("cow://")).opaque());
+}
+TEST_F(OriginTest, NonStandardSchemeWithAndroidWebViewHack) {
+  EnableNonStandardSchemesForAndroidWebView();
+  const Origin cow_origin = Origin::Create(GURL("cow://"));
+  EXPECT_FALSE(cow_origin.opaque());  // Expect the tuple ("cow", "", 0).
+  EXPECT_EQ("cow", cow_origin.scheme());
+  EXPECT_EQ("", cow_origin.host());
+  EXPECT_EQ(0, cow_origin.port());
+  ResetForTests();  // Presumably reverts the toggle set above; confirm.
+}
+
+TEST_F(OriginTest, CanBeDerivedFrom) {
+ Origin opaque_unique_origin = Origin();
+
+ Origin regular_origin = Origin::Create(GURL("https://a.com/"));
+ Origin opaque_precursor_origin = regular_origin.DeriveNewOpaqueOrigin();
+
+ Origin file_origin = Origin::Create(GURL("file:///foo/bar"));
+ Origin file_opaque_precursor_origin = file_origin.DeriveNewOpaqueOrigin();
+ Origin file_host_origin = Origin::Create(GURL("file://a.com/foo/bar"));
+ Origin file_host_opaque_precursor_origin =
+ file_host_origin.DeriveNewOpaqueOrigin();
+
+ Origin non_standard_scheme_origin =
+ Origin::Create(GURL("non-standard-scheme:foo"));
+ Origin non_standard_opaque_precursor_origin =
+ non_standard_scheme_origin.DeriveNewOpaqueOrigin();
+
+ // Also, add new standard scheme that is local to the test.
+ AddStandardScheme("new-standard", SchemeType::SCHEME_WITH_HOST);
+ Origin new_standard_origin = Origin::Create(GURL("new-standard://host/"));
+ Origin new_standard_opaque_precursor_origin =
+ new_standard_origin.DeriveNewOpaqueOrigin();
+
+ // No access schemes always get unique opaque origins.
+ Origin no_access_origin =
+ Origin::Create(GURL("standard-but-noaccess://b.com"));
+ Origin no_access_opaque_precursor_origin =
+ no_access_origin.DeriveNewOpaqueOrigin();
+
+ Origin local_non_standard_origin =
+ Origin::Create(GURL("local-but-nonstandard://a.com"));
+ Origin local_non_standard_opaque_precursor_origin =
+ local_non_standard_origin.DeriveNewOpaqueOrigin();
+
+ // Call origin.CanBeDerivedFrom(url) for each of the following test cases
+ // and ensure that it returns |expected_value|.
+ const struct {
+ const char* url;
+ Origin* origin;
+ bool expected_value;
+ } kTestCases[] = {
+ {"https://a.com", &regular_origin, true},
+ // Web URL can commit in an opaque origin with precursor information.
+ // Example: iframe sandbox navigated to a.com.
+ {"https://a.com", &opaque_precursor_origin, true},
+ // URL that comes from the web can never commit in an opaque unique
+ // origin. It must have precursor information.
+ {"https://a.com", &opaque_unique_origin, false},
+
+ // Cross-origin URLs should never work.
+ {"https://b.com", &regular_origin, false},
+ {"https://b.com", &opaque_precursor_origin, false},
+
+ // data: URL can never commit in a regular, non-opaque origin.
+ {"data:text/html,foo", &regular_origin, false},
+ // This is the default case: data: URLs commit in opaque origin carrying
+ // precursor information for the origin that created them.
+ {"data:text/html,foo", &opaque_precursor_origin, true},
+ // Browser-initiated navigations can result in data: URL committing in
+ // opaque unique origin.
+ {"data:text/html,foo", &opaque_unique_origin, true},
+
+ // about:blank can commit in regular origin (default case for iframes).
+ {"about:blank", &regular_origin, true},
+ // This can happen if data: URL that originated at a.com creates an
+ // about:blank iframe.
+ {"about:blank", &opaque_precursor_origin, true},
+ // Browser-initiated navigations can result in about:blank URL committing
+ // in opaque unique origin.
+ {"about:blank", &opaque_unique_origin, true},
+
+ // Default behavior of srcdoc is to inherit the origin of the parent
+ // document.
+ {"about:srcdoc", &regular_origin, true},
+ // This happens for sandboxed srcdoc iframe.
+ {"about:srcdoc", &opaque_precursor_origin, true},
+ // This can happen with browser-initiated navigation to about:blank or
+ // data: URL, which in turn add srcdoc iframe.
+ {"about:srcdoc", &opaque_unique_origin, true},
+
+ // Just like srcdoc, blob: URLs can be created in all the cases.
+ {"blob:https://a.com/foo", &regular_origin, true},
+ {"blob:https://a.com/foo", &opaque_precursor_origin, true},
+ {"blob:https://a.com/foo", &opaque_unique_origin, true},
+
+ {"filesystem:https://a.com/foo", &regular_origin, true},
+ {"filesystem:https://a.com/foo", &opaque_precursor_origin, true},
+ // Unlike blob: URLs, filesystem: ones cannot be created in a unique
+ // opaque origin.
+ {"filesystem:https://a.com/foo", &opaque_unique_origin, false},
+
+ // file: URLs cannot result in regular web origins, regardless of
+ // opaqueness.
+ {"file:///etc/passwd", &regular_origin, false},
+ {"file:///etc/passwd", &opaque_precursor_origin, false},
+ // However, they can result in regular file: origin and an opaque one
+ // containing another file: origin as precursor.
+ {"file:///etc/passwd", &file_origin, true},
+ {"file:///etc/passwd", &file_opaque_precursor_origin, true},
+ // It should not be possible to get an opaque unique origin for file:
+ // as it is a standard scheme and will always result in a tuple origin
+ // or will always be derived from another origin.
+ // Note: file:// URLs should become unique opaque origins at some point.
+ {"file:///etc/passwd", &opaque_unique_origin, false},
+
+ // The same set as above, but including a host.
+ {"file://a.com/etc/passwd", &regular_origin, false},
+ {"file://a.com/etc/passwd", &opaque_precursor_origin, false},
+ {"file://a.com/etc/passwd", &file_host_origin, true},
+ {"file://a.com/etc/passwd", &file_host_opaque_precursor_origin, true},
+ {"file://a.com/etc/passwd", &opaque_unique_origin, false},
+
+ // Locally registered standard scheme should behave the same way
+ // as built-in standard schemes.
+ {"new-standard://host/foo", &new_standard_origin, true},
+ {"new-standard://host/foo", &new_standard_opaque_precursor_origin, true},
+ {"new-standard://host/foo", &opaque_unique_origin, false},
+ {"new-standard://host2/foo", &new_standard_origin, false},
+ {"new-standard://host2/foo", &new_standard_opaque_precursor_origin,
+ false},
+
+ // A non-standard scheme should never commit in a standard origin or
+ // opaque origin with standard precursor information.
+ {"non-standard-scheme://a.com/foo", &regular_origin, false},
+ {"non-standard-scheme://a.com/foo", &opaque_precursor_origin, false},
+ // However, it should be fine to commit in unique opaque origins or in its
+ // own origin.
+ // Note: since non-standard scheme URLs don't parse out anything
+ // but the scheme, using a random different hostname here would work.
+ {"non-standard-scheme://b.com/foo2", &opaque_unique_origin, true},
+ {"non-standard-scheme://b.com/foo3", &non_standard_scheme_origin, true},
+ {"non-standard-scheme://b.com/foo4",
+ &non_standard_opaque_precursor_origin, true},
+
+ // No access scheme can only commit in opaque origin.
+ {"standard-but-noaccess://a.com/foo", &regular_origin, false},
+ {"standard-but-noaccess://a.com/foo", &opaque_precursor_origin, false},
+ {"standard-but-noaccess://a.com/foo", &opaque_unique_origin, true},
+ {"standard-but-noaccess://a.com/foo", &no_access_origin, false},
+ {"standard-but-noaccess://a.com/foo", &no_access_opaque_precursor_origin,
+ false},
+ {"standard-but-noaccess://b.com/foo", &no_access_origin, false},
+ {"standard-but-noaccess://b.com/foo", &no_access_opaque_precursor_origin,
+ true},
+
+ // Local schemes can be non-standard, verify they also work as expected.
+ {"local-but-nonstandard://a.com", &regular_origin, false},
+ {"local-but-nonstandard://a.com", &opaque_precursor_origin, false},
+ {"local-but-nonstandard://a.com", &opaque_unique_origin, true},
+ {"local-but-nonstandard://a.com", &local_non_standard_origin, true},
+ {"local-but-nonstandard://a.com",
+ &local_non_standard_opaque_precursor_origin, true},
+ };
+
+ for (const auto& test_case : kTestCases) {
+ SCOPED_TRACE(testing::Message() << "(origin, url): (" << *test_case.origin
+ << ", " << test_case.url << ")");
+ EXPECT_EQ(test_case.expected_value,
+ test_case.origin->CanBeDerivedFrom(GURL(test_case.url)));
+ }
+}
+
+TEST_F(OriginTest, GetDebugString) {
+  Origin tuple_origin = Origin::Create(GURL("http://192.168.9.1"));
+  EXPECT_STREQ(tuple_origin.GetDebugString().c_str(), "http://192.168.9.1");
+
+  Origin derived_from_tuple = tuple_origin.DeriveNewOpaqueOrigin();
+  EXPECT_THAT(
+      derived_from_tuple.GetDebugString().c_str(),
+      ::testing::MatchesRegex(
+          "null \\[internally: \\(\\w*\\) derived from http://192.168.9.1\\]"));
+
+  Origin anonymous = Origin::Create(GURL("data:"));
+  EXPECT_STREQ(anonymous.GetDebugString().c_str(),
+               "null [internally: (nonce TBD) anonymous]");
+
+  // Deriving a new opaque origin initializes the nonce, so the derived
+  // origin's debug string no longer shows the "nonce TBD" placeholder.
+  Origin derived_from_anonymous = anonymous.DeriveNewOpaqueOrigin();
+  EXPECT_THAT(
+      derived_from_anonymous.GetDebugString().c_str(),
+      ::testing::MatchesRegex("null \\[internally: \\(\\w*\\) anonymous\\]"));
+
+  Origin local_file = Origin::Create(GURL("file:///etc/passwd"));
+  EXPECT_STREQ(local_file.GetDebugString().c_str(),
+               "file:// [internally: file://]");
+
+  Origin hosted_file =
+      Origin::Create(GURL("file://example.com/etc/passwd"));
+  EXPECT_STREQ(hosted_file.GetDebugString().c_str(),
+               "file:// [internally: file://example.com]");
+}
+
+} // namespace url
diff --git a/url/run_all_perftests.cc b/url/run_all_perftests.cc
new file mode 100644
index 0000000..be7a746
--- /dev/null
+++ b/url/run_all_perftests.cc
@@ -0,0 +1,14 @@
+// Copyright 2019 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/bind.h"
+#include "base/test/launcher/unit_test_launcher.h"
+#include "base/test/perf_test_suite.h"
+
+int main(int argc, char** argv) {
+  gurl_base::PerfTestSuite perf_suite(argc, argv);  // Bound Unretained below.
+  return gurl_base::LaunchUnitTestsSerially(
+      argc, argv, gurl_base::BindOnce(&gurl_base::TestSuite::Run,
+                                      gurl_base::Unretained(&perf_suite)));
+}
diff --git a/url/run_all_unittests.cc b/url/run_all_unittests.cc
new file mode 100644
index 0000000..0f6a431
--- /dev/null
+++ b/url/run_all_unittests.cc
@@ -0,0 +1,27 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <memory>
+
+#include "base/bind.h"
+#include "base/test/launcher/unit_test_launcher.h"
+#include "base/test/test_io_thread.h"
+#include "base/test/test_suite.h"
+#include "build/build_config.h"
+
+#if !defined(OS_IOS)
+#include "mojo/core/embedder/embedder.h" // nogncheck
+#endif
+
+int main(int argc, char** argv) {
+  gurl_base::TestSuite suite(argc, argv);
+
+#if !defined(OS_IOS)
+  mojo::core::Init();  // Skipped on iOS, where the mojo header is not included.
+#endif
+
+  return gurl_base::LaunchUnitTests(
+      argc, argv, gurl_base::BindOnce(&gurl_base::TestSuite::Run,
+                                      gurl_base::Unretained(&suite)));
+}
diff --git a/url/scheme_host_port.cc b/url/scheme_host_port.cc
new file mode 100644
index 0000000..290e8a6
--- /dev/null
+++ b/url/scheme_host_port.cc
@@ -0,0 +1,268 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/scheme_host_port.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <tuple>
+
+#include "polyfills/base/logging.h"
+#include "base/numerics/safe_conversions.h"
+#include "base/stl_util.h"
+#include "base/strings/string_number_conversions.h"
+#include "url/gurl.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
+
+namespace url {
+
+namespace {
+
+bool IsCanonicalHost(const gurl_base::StringPiece& host) {
+  std::string canonical;
+
+  // Canonicalize the whole of |host| (logic mirrors the copy in net/base).
+  const Component whole_host(0,
+                             gurl_base::checked_cast<int>(host.length()));
+  StdStringCanonOutput sink(&canonical);
+  CanonHostInfo info;
+  CanonicalizeHostVerbose(host.data(), whole_host, &sink, &info);
+
+  const bool failed = !info.out_host.is_nonempty() ||
+                      info.family == CanonHostInfo::BROKEN;
+  if (failed) {
+    // Empty host, or canonicalization failed: compare against "".
+    canonical.clear();
+  } else {
+    // Success: finalize the output and assert there is no extra garbage.
+    sink.Complete();
+    GURL_DCHECK_EQ(info.out_host.len, static_cast<int>(canonical.length()));
+  }
+
+  return host == canonical;
+}
+
+bool IsValidInput(const gurl_base::StringPiece& scheme,
+ const gurl_base::StringPiece& host,
+ uint16_t port,
+ SchemeHostPort::ConstructPolicy policy) {
+ // Validates a (scheme, host, port) tuple. Empty schemes are never valid.
+ if (scheme.empty())
+ return false;
+
+ SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ bool is_standard = GetStandardSchemeType(
+ scheme.data(),
+ Component(0, gurl_base::checked_cast<int>(scheme.length())),
+ &scheme_type);
+ if (!is_standard) {
+ // To be consistent with blink, local non-standard schemes are currently
+ // allowed to be tuple origins. Nonstandard schemes don't have hostnames,
+ // so their tuple is just ("protocol", "", 0).
+ //
+ // TODO: Migrate "content:" and "externalfile:" to be standard schemes, and
+ // remove this local scheme exception.
+ if (gurl_base::Contains(GetLocalSchemes(), scheme) && host.empty() && port == 0)
+ return true;
+
+ // Otherwise, allow non-standard schemes only if the Android WebView
+ // workaround is enabled.
+ return AllowNonStandardSchemesForAndroidWebView();
+ }
+
+ switch (scheme_type) {
+ case SCHEME_WITH_HOST_AND_PORT:
+ case SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION:
+ // A URL with |scheme| is required to have the host and port (may be
+ // omitted in a serialization if it's the same as the default value).
+ // Return an invalid instance if either of them is not given.
+ if (host.empty() || port == 0)
+ return false;
+
+ // Verify canonicalization only when |policy| is CHECK_CANONICALIZATION;
+ // otherwise the host is assumed canonical and that is merely DCHECKed.
+ GURL_DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION ||
+ IsCanonicalHost(host));
+ if (policy == SchemeHostPort::CHECK_CANONICALIZATION &&
+ !IsCanonicalHost(host)) {
+ return false;
+ }
+
+ return true;
+
+ case SCHEME_WITH_HOST:
+ if (port != 0) {
+ // Return an invalid object if a URL with the scheme never represents
+ // the port data but the given |port| is non-zero.
+ return false;
+ }
+
+ // Verify canonicalization only when |policy| is CHECK_CANONICALIZATION;
+ // otherwise the host is assumed canonical and that is merely DCHECKed.
+ GURL_DCHECK(policy == SchemeHostPort::CHECK_CANONICALIZATION ||
+ IsCanonicalHost(host));
+ if (policy == SchemeHostPort::CHECK_CANONICALIZATION &&
+ !IsCanonicalHost(host)) {
+ return false;
+ }
+
+ return true;
+
+ case SCHEME_WITHOUT_AUTHORITY:  // No authority: never a valid tuple.
+ return false;
+
+ default:
+ GURL_NOTREACHED();
+ return false;
+ }
+}
+
+} // namespace
+
+// Default-constructs the invalid tuple: nothing but a zero port is set.
+SchemeHostPort::SchemeHostPort() : port_(0) {}
+
+SchemeHostPort::SchemeHostPort(std::string scheme,
+                               std::string host,
+                               uint16_t port,
+                               ConstructPolicy policy)
+    : port_(0) {
+  if (IsValidInput(scheme, host, port, policy)) {
+    // Accepted: take ownership of the strings and record the port.
+    scheme_ = std::move(scheme);
+    host_ = std::move(host);
+    port_ = port;
+    GURL_DCHECK(!IsInvalid()) << "Scheme: " << scheme_ << " Host: " << host_
+                              << " Port: " << port;
+  } else {
+    GURL_DCHECK(IsInvalid());  // Rejected input leaves the tuple invalid.
+  }
+}
+
+// Overload for string pieces: copies |scheme| and |host| into strings and
+// delegates to the constructor above with CHECK_CANONICALIZATION.
+SchemeHostPort::SchemeHostPort(gurl_base::StringPiece scheme,
+                               gurl_base::StringPiece host,
+                               uint16_t port)
+    : SchemeHostPort(scheme.as_string(), host.as_string(), port,
+                     CHECK_CANONICALIZATION) {}
+
+SchemeHostPort::SchemeHostPort(const GURL& url) : port_(0) {
+  if (!url.is_valid())
+    return;
+
+  const gurl_base::StringPiece url_scheme = url.scheme_piece();
+  const gurl_base::StringPiece url_host = url.host_piece();
+
+  // A valid GURL never returns PORT_INVALID; "unspecified" is stored as 0.
+  int effective_port = url.EffectiveIntPort();
+  if (effective_port != PORT_UNSPECIFIED) {
+    GURL_DCHECK_GE(effective_port, 0);
+    GURL_DCHECK_LE(effective_port, 65535);
+  } else {
+    effective_port = 0;
+  }
+
+  if (IsValidInput(url_scheme, url_host, effective_port,
+                   ALREADY_CANONICALIZED)) {
+    url_scheme.CopyToString(&scheme_);
+    url_host.CopyToString(&host_);
+    port_ = effective_port;
+  }
+}
+
+SchemeHostPort::~SchemeHostPort() = default;  // Nothing beyond member cleanup.
+
+bool SchemeHostPort::IsInvalid() const {
+  // Only |scheme_| needs checking: host and port are never set without it.
+  const bool no_scheme = scheme_.empty();
+  GURL_DCHECK(!no_scheme || host_.empty());
+  GURL_DCHECK(!no_scheme || port_ == 0);
+  return no_scheme;
+}
+
+std::string SchemeHostPort::Serialize() const {
+  // Pass a throwaway Parsed: null-checking |parsed| inside SerializeInternal()
+  // is probably slower than filling one in and discarding it.
+  url::Parsed scratch;
+  return SerializeInternal(&scratch);
+}
+
+GURL SchemeHostPort::GetURL() const {
+  url::Parsed components;
+  std::string spec = SerializeInternal(&components);
+
+  if (IsInvalid())
+    return GURL(std::move(spec), components, false);
+
+  // With an empty host there is not enough information here to decide whether
+  // the tuple is valid for its scheme, so let GURL re-parse the string.
+  GURL_DCHECK(!scheme_.empty());
+  if (host_.empty())
+    return GURL(spec);
+
+  // Parsing the serialization would add an empty path "/", so add it by hand
+  // and hand GURL the pre-parsed components. Note: per RFC 6454 this cannot
+  // be done for normal Origin serialization.
+  GURL_DCHECK(!components.path.is_valid());
+  components.path = Component(spec.length(), 1);
+  spec.push_back('/');
+  return GURL(std::move(spec), components, true);
+}
+
+bool SchemeHostPort::operator<(const SchemeHostPort& other) const {
+  return std::tie(other.port_, other.scheme_, other.host_) >
+         std::tie(port_, scheme_, host_);  // Keyed on port, scheme, host.
+}
+
+std::string SchemeHostPort::SerializeInternal(url::Parsed* parsed) const {
+  std::string out;
+  if (IsInvalid())
+    return out;
+
+  // Room for the "normal" case of scheme://host/.
+  out.reserve(scheme_.size() + host_.size() + 4);
+
+  // Scheme, with its span recorded in |parsed|.
+  if (!scheme_.empty()) {
+    parsed->scheme = Component(0, scheme_.length());
+    out.append(scheme_);
+  }
+
+  out.append(kStandardSchemeSeparator);
+
+  if (!host_.empty()) {
+    parsed->host = Component(out.length(), host_.length());
+    out.append(host_);
+  }
+
+  // A port of 0 is never written. Otherwise the port is written only when
+  // the scheme has a default port defined and |port_| differs from it; the
+  // port component is omitted in every other case.
+  if (port_ != 0) {
+    const int default_port = DefaultPortForScheme(
+        scheme_.data(), static_cast<int>(scheme_.length()));
+    if (default_port != PORT_UNSPECIFIED && port_ != default_port) {
+      // Non-default port: emit ":<port>" and record the span of the digits.
+      out.push_back(':');
+      const std::string port_text = gurl_base::NumberToString(port_);
+      parsed->port = Component(out.length(), port_text.length());
+      out.append(port_text);
+    }
+  }
+
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out,
+ const SchemeHostPort& scheme_host_port) {
+ return out << scheme_host_port.Serialize();  // Streams the Serialize() text.
+}
+
+} // namespace url
diff --git a/url/scheme_host_port.h b/url/scheme_host_port.h
new file mode 100644
index 0000000..a2dded1
--- /dev/null
+++ b/url/scheme_host_port.h
@@ -0,0 +1,170 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_SCHEME_HOST_PORT_H_
+#define URL_SCHEME_HOST_PORT_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "polyfills/base/component_export.h"
+#include "base/strings/string_piece.h"
+
+class GURL;
+
+namespace url {
+
+struct Parsed;
+
+// This class represents a (scheme, host, port) tuple extracted from a URL.
+//
+// The primary purpose of this class is to represent relevant network-authority
+// information for a URL. It is _not_ an Origin, as described in RFC 6454. In
+// particular, it is generally NOT the right thing to use for security
+// decisions.
+//
+// Instead, this class is a mechanism for simplifying URLs with standard schemes
+// (that is, those which follow the generic syntax of RFC 3986) down to the
+// uniquely identifying information necessary for network fetches. This makes it
+// suitable as a cache key for a collection of active connections, for instance.
+// It may, however, be inappropriate to use as a cache key for persistent
+// storage associated with a host.
+//
+// In particular, note that:
+//
+// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax
+// (e.g. those registered with GURL as "standard schemes"). Non-standard
+// schemes such as "blob", "filesystem", "data", and "javascript" can only be
+// represented as invalid SchemeHostPort objects.
+//
+// * For example, the "file" scheme follows the standard syntax, but it is
+// important to note that the authority portion (host, port) is optional.
+// URLs without an authority portion will be represented with an empty string
+// for the host, and a port of 0 (e.g. "file:///etc/hosts" =>
+// ("file", "", 0)), and URLs with a host-only authority portion will be
+// represented with a port of 0 (e.g. "file://example.com/etc/hosts" =>
+// ("file", "example.com", 0)). See Section 3 of RFC 3986 to better understand
+// these constructs.
+//
+// * SchemeHostPort has no notion of the Origin concept (RFC 6454), and in
+// particular, it has no notion of an opaque Origin. If you need to take
+// opaque origins into account (and, if you're making security-relevant
+// decisions then you absolutely do), please use 'url::Origin' instead.
+//
+// Usage:
+//
+// * SchemeHostPort objects are commonly created from GURL objects:
+//
+// GURL url("https://example.com/");
+// url::SchemeHostPort tuple(url);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// * Objects may also be explicitly created and compared:
+//
+// url::SchemeHostPort tuple(url::kHttpsScheme, "example.com", 443);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// GURL url("https://example.com/");
+// tuple == url::SchemeHostPort(url); // true
+class COMPONENT_EXPORT(URL) SchemeHostPort {
+ public:
+ // Creates an invalid (scheme, host, port) tuple, which represents an invalid
+ // or non-standard URL.
+ SchemeHostPort();
+
+ // Creates a (scheme, host, port) tuple. |host| must be a canonicalized
+ // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme|
+ // must be a standard scheme. |port| must not be 0, unless |scheme| does not
+ // support ports (e.g. 'file'). In that case, |port| must be 0.
+ //
+ // Copies the data in |scheme| and |host|.
+ SchemeHostPort(gurl_base::StringPiece scheme,
+ gurl_base::StringPiece host,
+ uint16_t port);
+
+ // Metadata influencing whether or not the constructor should sanity check
+ // host canonicalization.
+ enum ConstructPolicy { CHECK_CANONICALIZATION, ALREADY_CANONICALIZED };
+
+ // Creates a (scheme, host, port) tuple without performing sanity checking
+ // that the host and port are canonicalized. This should only be used when
+ // converting between already normalized types, and should NOT be used for
+ // IPC.
+ SchemeHostPort(std::string scheme,
+ std::string host,
+ uint16_t port,
+ ConstructPolicy policy);
+
+ // Creates a (scheme, host, port) tuple from |url|, as described at
+ // https://tools.ietf.org/html/rfc6454#section-4
+ //
+ // If |url| is invalid or non-standard, the result will be an invalid
+ // SchemeHostPort object.
+ explicit SchemeHostPort(const GURL& url);
+
+ // Copyable and movable.
+ SchemeHostPort(const SchemeHostPort&) = default;
+ SchemeHostPort& operator=(const SchemeHostPort&) = default;
+ SchemeHostPort(SchemeHostPort&&) = default;
+ SchemeHostPort& operator=(SchemeHostPort&&) = default;
+
+ ~SchemeHostPort();
+
+ // Returns the host component, in URL form. That is all IDN domain names will
+ // be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'), and
+ // all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]").
+ const std::string& host() const { return host_; }
+ const std::string& scheme() const { return scheme_; }
+ uint16_t port() const { return port_; }
+ bool IsInvalid() const;
+
+ // Serializes the SchemeHostPort tuple to a canonical form.
+ //
+ // While this string form resembles the Origin serialization specified in
+ // Section 6.2 of RFC 6454, it is important to note that invalid
+ // SchemeHostPort tuples serialize to the empty string, rather than being
+ // serialized as would an opaque Origin.
+ std::string Serialize() const;
+
+ // Efficiently returns what GURL(Serialize()) would return, without needing to
+ // re-parse the URL.
+ GURL GetURL() const;
+
+ // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports
+ // are exact matches.
+ //
+ // Note that this comparison is _not_ the same as an origin-based comparison.
+ // In particular, invalid SchemeHostPort objects match each other (and
+ // themselves). Opaque origins, on the other hand, would not.
+ bool operator==(const SchemeHostPort& other) const {
+ return port_ == other.port() && scheme_ == other.scheme() &&
+ host_ == other.host();
+ }
+ bool operator!=(const SchemeHostPort& other) const {
+ return !(*this == other);
+ }
+ // Allows SchemeHostPort to be used as a key in STL (for example, a std::set
+ // or std::map).
+ bool operator<(const SchemeHostPort& other) const;
+
+ private:
+ std::string SerializeInternal(url::Parsed* parsed) const;
+
+ std::string scheme_;
+ std::string host_;
+ uint16_t port_;
+};
+
+COMPONENT_EXPORT(URL)
+std::ostream& operator<<(std::ostream& out,
+ const SchemeHostPort& scheme_host_port);
+
+} // namespace url
+
+#endif // URL_SCHEME_HOST_PORT_H_
diff --git a/url/scheme_host_port_unittest.cc b/url/scheme_host_port_unittest.cc
new file mode 100644
index 0000000..5270c70
--- /dev/null
+++ b/url/scheme_host_port_unittest.cc
@@ -0,0 +1,285 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "base/stl_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/scheme_host_port.h"
+#include "url/url_util.h"
+
+namespace {
+
+class SchemeHostPortTest : public testing::Test {
+ public:
+ SchemeHostPortTest() = default;
+ ~SchemeHostPortTest() override {
+ // Reset any added schemes.
+ url::ResetForTests();
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(SchemeHostPortTest);
+};
+
+void ExpectParsedUrlsEqual(const GURL& a, const GURL& b) {
+ EXPECT_EQ(a, b);
+ const url::Parsed& a_parsed = a.parsed_for_possibly_invalid_spec();
+ const url::Parsed& b_parsed = b.parsed_for_possibly_invalid_spec();
+ EXPECT_EQ(a_parsed.scheme.begin, b_parsed.scheme.begin);
+ EXPECT_EQ(a_parsed.scheme.len, b_parsed.scheme.len);
+ EXPECT_EQ(a_parsed.username.begin, b_parsed.username.begin);
+ EXPECT_EQ(a_parsed.username.len, b_parsed.username.len);
+ EXPECT_EQ(a_parsed.password.begin, b_parsed.password.begin);
+ EXPECT_EQ(a_parsed.password.len, b_parsed.password.len);
+ EXPECT_EQ(a_parsed.host.begin, b_parsed.host.begin);
+ EXPECT_EQ(a_parsed.host.len, b_parsed.host.len);
+ EXPECT_EQ(a_parsed.port.begin, b_parsed.port.begin);
+ EXPECT_EQ(a_parsed.port.len, b_parsed.port.len);
+ EXPECT_EQ(a_parsed.path.begin, b_parsed.path.begin);
+ EXPECT_EQ(a_parsed.path.len, b_parsed.path.len);
+ EXPECT_EQ(a_parsed.query.begin, b_parsed.query.begin);
+ EXPECT_EQ(a_parsed.query.len, b_parsed.query.len);
+ EXPECT_EQ(a_parsed.ref.begin, b_parsed.ref.begin);
+ EXPECT_EQ(a_parsed.ref.len, b_parsed.ref.len);
+}
+
+TEST_F(SchemeHostPortTest, Invalid) {
+ url::SchemeHostPort invalid;
+ EXPECT_EQ("", invalid.scheme());
+ EXPECT_EQ("", invalid.host());
+ EXPECT_EQ(0, invalid.port());
+ EXPECT_TRUE(invalid.IsInvalid());
+ EXPECT_EQ(invalid, invalid);
+
+ const char* urls[] = {
+ "data:text/html,Hello!", "javascript:alert(1)",
+ "file://example.com:443/etc/passwd",
+
+ // These schemes do not follow the generic URL syntax, so make sure we
+ // treat them as invalid (scheme, host, port) tuples (even though such
+ // URLs' _Origin_ might have a (scheme, host, port) tuple, they themselves
+ // do not). This is only *implicitly* checked in the code, by means of
+ // blob schemes not being standard, and filesystem schemes having type
+ // SCHEME_WITHOUT_AUTHORITY. If conditions change such that the implicit
+ // checks no longer hold, this policy should be made explicit.
+ "blob:https://example.com/uuid-goes-here",
+ "filesystem:https://example.com/temporary/yay.png"};
+
+ for (auto* test : urls) {
+ SCOPED_TRACE(test);
+ GURL url(test);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_EQ(tuple, tuple);
+ EXPECT_EQ(tuple, invalid);
+ EXPECT_EQ(invalid, tuple);
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, ExplicitConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {
+ {"http", "example.com", 80},
+ {"http", "example.com", 123},
+ {"https", "example.com", 443},
+ {"https", "example.com", 123},
+ {"file", "", 0},
+ {"file", "example.com", 0},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_EQ(tuple, tuple);
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, InvalidConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {{"", "", 0},
+ {"data", "", 0},
+ {"blob", "", 0},
+ {"filesystem", "", 0},
+ {"http", "", 80},
+ {"data", "example.com", 80},
+ {"http", "☃.net", 80},
+ {"http\nmore", "example.com", 80},
+ {"http\rmore", "example.com", 80},
+ {"http\n", "example.com", 80},
+ {"http\r", "example.com", 80},
+ {"http", "example.com\nnot-example.com", 80},
+ {"http", "example.com\rnot-example.com", 80},
+ {"http", "example.com\n", 80},
+ {"http", "example.com\r", 80},
+ {"http", "example.com", 0},
+ {"file", "", 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_EQ(tuple, tuple);
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) {
+ struct TestCases {
+ const char* scheme;
+ size_t scheme_length;
+ const char* host;
+ size_t host_length;
+ uint16_t port;
+ } cases[] = {{"http\0more", 9, "example.com", 11, 80},
+ {"http\0", 5, "example.com", 11, 80},
+ {"\0http", 5, "example.com", 11, 80},
+ {"http", 4, "example.com\0not-example.com", 27, 80},
+ {"http", 4, "example.com\0", 12, 80},
+ {"http", 4, "\0example.com", 12, 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length),
+ std::string(test.host, test.host_length),
+ test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, GURLConstruction) {
+ struct TestCases {
+ const char* url;
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } cases[] = {
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+ {"http://u:p@example.com/", "http", "example.com", 80},
+ {"http://u:p@example.com/path", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123#hash", "http", "example.com", 80},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ EXPECT_TRUE(url.is_valid());
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_EQ(tuple, tuple);
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, Serialization) {
+ struct TestCases {
+ const char* url;
+ const char* expected;
+ } cases[] = {
+ {"http://192.168.9.1/", "http://192.168.9.1"},
+ {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+ {"http://☃.net/", "http://xn--n3h.net"},
+ {"http://example.com/", "http://example.com"},
+ {"http://example.com:123/", "http://example.com:123"},
+ {"https://example.com/", "https://example.com"},
+ {"https://example.com:123/", "https://example.com:123"},
+ {"file:///etc/passwd", "file://"},
+ {"file://example.com/etc/passwd", "file://example.com"},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.expected, tuple.Serialize());
+ ExpectParsedUrlsEqual(GURL(tuple.Serialize()), tuple.GetURL());
+ }
+}
+
+TEST_F(SchemeHostPortTest, Comparison) {
+ // These tuples are arranged in increasing order:
+ struct SchemeHostPorts {
+ const char* scheme;
+ const char* host;
+ uint16_t port;
+ } tuples[] = {
+ {"http", "a", 80},
+ {"http", "b", 80},
+ {"https", "a", 80},
+ {"https", "b", 80},
+ {"http", "a", 81},
+ {"http", "b", 81},
+ {"https", "a", 81},
+ {"https", "b", 81},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(tuples); i++) {
+ url::SchemeHostPort current(tuples[i].scheme, tuples[i].host,
+ tuples[i].port);
+ for (size_t j = i; j < gurl_base::size(tuples); j++) {
+ url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host,
+ tuples[j].port);
+ EXPECT_EQ(i < j, current < to_compare) << i << " < " << j;
+ EXPECT_EQ(j < i, to_compare < current) << j << " < " << i;
+ }
+ }
+}
+
+// Some schemes have optional authority. Make sure that GURL conversion from
+// SchemeHostPort is not opinionated in that regard. For more info, see
+// crbug.com/820194, where we considered all SchemeHostPorts with
+// SCHEME_WITH_HOST (i.e., without ports) as valid with empty hosts, even though
+// most are not (e.g. chrome URLs).
+TEST_F(SchemeHostPortTest, EmptyHostGurlConversion) {
+ url::AddStandardScheme("chrome", url::SCHEME_WITH_HOST);
+
+ GURL chrome_url("chrome:");
+ EXPECT_FALSE(chrome_url.is_valid());
+
+ url::SchemeHostPort chrome_tuple("chrome", "", 0);
+ EXPECT_FALSE(chrome_tuple.GetURL().is_valid());
+ ExpectParsedUrlsEqual(GURL(chrome_tuple.Serialize()), chrome_tuple.GetURL());
+ ExpectParsedUrlsEqual(chrome_url, chrome_tuple.GetURL());
+}
+
+} // namespace
diff --git a/url/third_party/mozilla/LICENSE.txt b/url/third_party/mozilla/LICENSE.txt
new file mode 100644
index 0000000..ac40837
--- /dev/null
+++ b/url/third_party/mozilla/LICENSE.txt
@@ -0,0 +1,65 @@
+Copyright 2007, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-------------------------------------------------------------------------------
+
+The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is
+licensed separately as follows:
+
+The contents of this file are subject to the Mozilla Public License Version
+1.1 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.mozilla.org/MPL/
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+for the specific language governing rights and limitations under the
+License.
+
+The Original Code is mozilla.org code.
+
+The Initial Developer of the Original Code is
+Netscape Communications Corporation.
+Portions created by the Initial Developer are Copyright (C) 1998
+the Initial Developer. All Rights Reserved.
+
+Contributor(s):
+ Darin Fisher (original author)
+
+Alternatively, the contents of this file may be used under the terms of
+either the GNU General Public License Version 2 or later (the "GPL"), or
+the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+in which case the provisions of the GPL or the LGPL are applicable instead
+of those above. If you wish to allow use of your version of this file only
+under the terms of either the GPL or the LGPL, and not to allow others to
+use your version of this file under the terms of the MPL, indicate your
+decision by deleting the provisions above and replace them with the notice
+and other provisions required by the GPL or the LGPL. If you do not delete
+the provisions above, a recipient may use your version of this file under
+the terms of any one of the MPL, the GPL or the LGPL.
diff --git a/url/third_party/mozilla/OWNERS b/url/third_party/mozilla/OWNERS
new file mode 100644
index 0000000..3605f48
--- /dev/null
+++ b/url/third_party/mozilla/OWNERS
@@ -0,0 +1 @@
+# COMPONENT: Internals>Core
diff --git a/url/third_party/mozilla/README.chromium b/url/third_party/mozilla/README.chromium
new file mode 100644
index 0000000..ef396d3
--- /dev/null
+++ b/url/third_party/mozilla/README.chromium
@@ -0,0 +1,8 @@
+Name: url_parse
+URL: http://mxr.mozilla.org/comm-central/source/mozilla/netwerk/base/src/nsURLParsers.cpp
+License: BSD and MPL 1.1/GPL 2.0/LGPL 2.1
+License File: LICENSE.txt
+
+Description:
+
+The file url_parse.cc is based on nsURLParsers.cc from Mozilla.
diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc
new file mode 100644
index 0000000..8756cf7
--- /dev/null
+++ b/url/third_party/mozilla/url_parse.cc
@@ -0,0 +1,945 @@
+/* Based on nsURLParsers.cc from Mozilla
+ * -------------------------------------
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * Darin Fisher (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "url/third_party/mozilla/url_parse.h"
+
+#include <stdlib.h>
+
+#include "polyfills/base/logging.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// Returns true if the given character is a valid digit to use in a port.
+inline bool IsPortDigit(gurl_base::char16 ch) {
+ return ch >= '0' && ch <= '9';
+}
+
+// Returns the offset of the next authority terminator in the input starting
+// from start_offset. If no terminator is found, the return value will be equal
+// to spec_len.
+template<typename CHAR>
+int FindNextAuthorityTerminator(const CHAR* spec,
+ int start_offset,
+ int spec_len) {
+ for (int i = start_offset; i < spec_len; i++) {
+ if (IsAuthorityTerminator(spec[i]))
+ return i;
+ }
+ return spec_len; // Not found.
+}
+
+template<typename CHAR>
+void ParseUserInfo(const CHAR* spec,
+ const Component& user,
+ Component* username,
+ Component* password) {
+ // Find the first colon in the user section, which separates the username and
+ // password.
+ int colon_offset = 0;
+ while (colon_offset < user.len && spec[user.begin + colon_offset] != ':')
+ colon_offset++;
+
+ if (colon_offset < user.len) {
+ // Found separator: <username>:<password>
+ *username = Component(user.begin, colon_offset);
+ *password = MakeRange(user.begin + colon_offset + 1,
+ user.begin + user.len);
+ } else {
+ // No separator, treat everything as the username
+ *username = user;
+ *password = Component();
+ }
+}
+
+template<typename CHAR>
+void ParseServerInfo(const CHAR* spec,
+ const Component& serverinfo,
+ Component* hostname,
+ Component* port_num) {
+ if (serverinfo.len == 0) {
+ // No server info, host name is empty.
+ hostname->reset();
+ port_num->reset();
+ return;
+ }
+
+ // If the host starts with a left-bracket, assume the entire host is an
+ // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal.
+ // This assumption will be overridden if we find a right-bracket.
+ //
+ // Our IPv6 address canonicalization code requires both brackets to exist,
+ // but the ability to locate an incomplete address can still be useful.
+ int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1;
+ int colon = -1;
+
+ // Find the last right-bracket, and the last colon.
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) {
+ switch (spec[i]) {
+ case ']':
+ ipv6_terminator = i;
+ break;
+ case ':':
+ colon = i;
+ break;
+ }
+ }
+
+ if (colon > ipv6_terminator) {
+ // Found a port number: <hostname>:<port>
+ *hostname = MakeRange(serverinfo.begin, colon);
+ if (hostname->len == 0)
+ hostname->reset();
+ *port_num = MakeRange(colon + 1, serverinfo.end());
+ } else {
+ // No port: <hostname>
+ *hostname = serverinfo;
+ port_num->reset();
+ }
+}
+
+// Given an already-identified auth section, breaks it into its constituent
+// parts. No integer port value is computed here: |port_num| receives only the
+// range of the port text within |spec| and is reset when no port is present.
+// All four output components are reset when the authority is empty.
+template<typename CHAR>
+void DoParseAuthority(const CHAR* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num) {
+ GURL_DCHECK(auth.is_valid()) << "We should always get an authority";
+ if (auth.len == 0) {
+ username->reset();
+ password->reset();
+ hostname->reset();
+ port_num->reset();
+ return;
+ }
+
+ // Search backwards for @, which is the separator between the user info and
+ // the server info.
+ int i = auth.begin + auth.len - 1;
+ while (i > auth.begin && spec[i] != '@')
+ i--;
+
+ if (spec[i] == '@') {
+ // Found user info: <user-info>@<server-info>
+ ParseUserInfo(spec, Component(auth.begin, i - auth.begin),
+ username, password);
+ ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len),
+ hostname, port_num);
+ } else {
+ // No user info, everything is server info.
+ username->reset();
+ password->reset();
+ ParseServerInfo(spec, auth, hostname, port_num);
+ }
+}
+
+template <typename CHAR>
+inline void FindQueryAndRefParts(const CHAR* spec,
+ const Component& path,
+ int* query_separator,
+ int* ref_separator) {
+ int path_end = path.begin + path.len;
+ for (int i = path.begin; i < path_end; i++) {
+ switch (spec[i]) {
+ case '?':
+ // Only match the query string if it precedes the reference fragment
+ // and when we haven't found one already.
+ if (*query_separator < 0)
+ *query_separator = i;
+ break;
+ case '#':
+ // Record the first # sign only.
+ if (*ref_separator < 0) {
+ *ref_separator = i;
+ return;
+ }
+ break;
+ }
+ }
+}
+
+template<typename CHAR>
+void ParsePath(const CHAR* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref) {
+ // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
+
+ // Special case when there is no path.
+ if (path.len == -1) {
+ filepath->reset();
+ query->reset();
+ ref->reset();
+ return;
+ }
+ GURL_DCHECK(path.len > 0) << "We should never have 0 length paths";
+
+ // Search for first occurrence of either ? or #.
+ int query_separator = -1; // Index of the '?'
+ int ref_separator = -1; // Index of the '#'
+ FindQueryAndRefParts(spec, path, &query_separator, &ref_separator);
+
+ // Markers pointing to the character after each of these corresponding
+ // components. The code below works from the end back to the beginning,
+ // and will update these indices as it finds components that exist.
+ int file_end, query_end;
+
+ // Ref fragment: from the # to the end of the path.
+ int path_end = path.begin + path.len;
+ if (ref_separator >= 0) {
+ file_end = query_end = ref_separator;
+ *ref = MakeRange(ref_separator + 1, path_end);
+ } else {
+ file_end = query_end = path_end;
+ ref->reset();
+ }
+
+ // Query fragment: everything from the ? to the next boundary (either the end
+ // of the path or the ref fragment).
+ if (query_separator >= 0) {
+ file_end = query_separator;
+ *query = MakeRange(query_separator + 1, query_end);
+ } else {
+ query->reset();
+ }
+
+ // File path: treat an empty file path as no file path.
+ if (file_end != path.begin)
+ *filepath = MakeRange(path.begin, file_end);
+ else
+ filepath->reset();
+}
+
+template<typename CHAR>
+bool DoExtractScheme(const CHAR* url,
+ int url_len,
+ Component* scheme) {
+ // Skip leading whitespace and control characters.
+ int begin = 0;
+ while (begin < url_len && ShouldTrimFromURL(url[begin]))
+ begin++;
+ if (begin == url_len)
+ return false; // Input is empty or all whitespace.
+
+ // Find the first colon character.
+ for (int i = begin; i < url_len; i++) {
+ if (url[i] == ':') {
+ *scheme = MakeRange(begin, i);
+ return true;
+ }
+ }
+ return false; // No colon found: no scheme
+}
+
+// Fills in all members of the Parsed structure except for the scheme.
+//
+// |spec| is the full spec being parsed, of length |spec_len|.
+// |after_scheme| is the offset of the character immediately following the
+// scheme (after the colon) where we'll begin parsing.
+//
+// Compatibility data points. I list "host", "path" extracted:
+// Input IE6 Firefox Us
+// ----- -------------- -------------- --------------
+// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/"
+// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/"
+// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
+//
+// (*) Interestingly, although IE fails to load these URLs, its history
+// canonicalizer handles them, meaning if you've been to the corresponding
+// "http://foo.com/" link, it will be colored.
+template <typename CHAR>
+void DoParseAfterScheme(const CHAR* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+ int after_slashes = after_scheme + num_slashes;
+
+ // First split into two main parts, the authority (username, password, host,
+ // and port) and the full path (path, query, and reference).
+ Component authority;
+ Component full_path;
+
+ // Found "//<some data>", looks like an authority section. Treat everything
+ // from there to the next slash (or end of spec) to be the authority. Note
+ // that we ignore the number of slashes and treat it as the authority.
+ int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len);
+ authority = Component(after_slashes, end_auth - after_slashes);
+
+ if (end_auth == spec_len) // No beginning of path found.
+ full_path = Component();
+ else // Everything starting from the slash to the end is the path.
+ full_path = Component(end_auth, spec_len - end_auth);
+
+ // Now parse those two sub-parts.
+ DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
+ &parsed->host, &parsed->port);
+ ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Entry point for parsing standard URLs, i.e. those that have a scheme,
+// host, path, etc.
+template<typename CHAR>
+void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  GURL_DCHECK(spec_len >= 0);
+
+  // Trim spaces and control characters from both ends of the spec.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len);
+
+  const bool has_scheme = DoExtractScheme(spec, spec_len, &parsed->scheme);
+  if (!has_scheme) {
+    // Without a colon we report "no scheme" rather than treating everything
+    // as the scheme. Either choice yields an invalid URL, but this one seems
+    // less wrong in more cases.
+    parsed->scheme.reset();
+  }
+
+  // With a scheme, parsing resumes just past its colon.
+  const int rest = has_scheme ? parsed->scheme.end() + 1 : trimmed_begin;
+  DoParseAfterScheme(spec, spec_len, rest, parsed);
+}
+
+// Parses a "filesystem:" URL, whose content is itself a nested (inner) URL.
+template<typename CHAR>
+void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  GURL_DCHECK(spec_len >= 0);
+
+  // Get the unused parts of the URL out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->path.reset();          // May use this; reset for convenience.
+  parsed->ref.reset();           // May use this; reset for convenience.
+  parsed->query.reset();         // May use this; reset for convenience.
+  parsed->clear_inner_parsed();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    return;
+  }
+
+  int inner_start = -1;
+
+  // Extract the scheme. We also handle the case where there is no scheme.
+  if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    parsed->scheme.begin += begin;
+
+    if (parsed->scheme.end() == spec_len - 1)
+      return;  // Nothing follows the outer scheme's colon.
+
+    inner_start = parsed->scheme.end() + 1;
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    parsed->scheme.reset();
+    return;
+  }
+
+  Component inner_scheme;
+  const CHAR* inner_spec = &spec[inner_start];
+  int inner_spec_len = spec_len - inner_start;
+
+  if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    inner_scheme.begin += inner_start;
+
+    if (inner_scheme.end() == spec_len - 1)
+      return;  // Nothing follows the inner scheme's colon.
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    // The best we can do is return "filesystem://".
+    return;
+  }
+
+  Parsed inner_parsed;
+
+  if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) {
+    // File URLs are special.
+    ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) {
+    // Filesystem URLs don't nest.
+    return;
+  } else if (IsStandard(spec, inner_scheme)) {
+    // All "normal" URLs.
+    DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else {
+    return;
+  }
+
+  // All members of inner_parsed need to be offset by inner_start.
+  // If we had any scheme that supported nesting more than one level deep,
+  // we'd have to recurse into the inner_parsed's inner_parsed when
+  // adjusting by inner_start.
+  inner_parsed.scheme.begin += inner_start;
+  inner_parsed.username.begin += inner_start;
+  inner_parsed.password.begin += inner_start;
+  inner_parsed.host.begin += inner_start;
+  inner_parsed.port.begin += inner_start;
+  inner_parsed.query.begin += inner_start;
+  inner_parsed.ref.begin += inner_start;
+  inner_parsed.path.begin += inner_start;
+
+  // Query and ref move from inner_parsed to parsed.
+  parsed->query = inner_parsed.query;
+  inner_parsed.query.reset();
+  parsed->ref = inner_parsed.ref;
+  inner_parsed.ref.reset();
+
+  parsed->set_inner_parsed(inner_parsed);
+  if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
+      inner_parsed.inner_parsed()) {
+    return;
+  }
+
+  // The path in inner_parsed should start with a slash, then have a filesystem
+  // type followed by a slash. The inner URL keeps the first slash up to but
+  // excluding the second; the rest goes to parsed. A path that ends before a
+  // second slash is let through. The scan is bounded by the end of the path so
+  // it can never run into the query or ref that were moved to parsed above.
+  if (!IsURLSlash(spec[inner_parsed.path.begin]))
+    return;
+  int inner_path_end = inner_parsed.path.begin + 1;  // skip the leading slash
+  while (inner_path_end < inner_parsed.path.end() &&
+         !IsURLSlash(spec[inner_path_end]))
+    ++inner_path_end;
+  parsed->path.begin = inner_path_end;
+  int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
+  parsed->path.len = inner_parsed.path.len - new_inner_path_length;
+  parsed->inner_parsed()->path.len = new_inner_path_length;
+}
+
+// Parses a "path URL": merely a scheme followed by a path. Examples include
+// "about:foo" and "javascript:alert('bar');"
+template<typename CHAR>
+void DoParsePathURL(const CHAR* spec, int spec_len,
+                    bool trim_path_end,
+                    Parsed* parsed) {
+  // The authority components are never used for this kind of URL, and the
+  // path, query and ref start out unspecified until ParsePath fills them in.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->path.reset();
+  parsed->query.reset();
+  parsed->ref.reset();
+
+  // Trim spaces and control characters around the spec.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len, trim_path_end);
+
+  // Nothing left: the spec was empty or held only whitespace/control chars.
+  if (trimmed_begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  // Locate the scheme; the path is everything after its colon. With no
+  // scheme at all, the path starts where the trimmed spec starts.
+  int path_begin = trimmed_begin;
+  const bool has_scheme = ExtractScheme(&spec[trimmed_begin],
+                                        spec_len - trimmed_begin,
+                                        &parsed->scheme);
+  if (has_scheme) {
+    // ExtractScheme was handed a substring, so shift its result back.
+    parsed->scheme.begin += trimmed_begin;
+    path_begin = parsed->scheme.end() + 1;
+  } else {
+    parsed->scheme.reset();
+  }
+
+  // A spec that stops right after the scheme has no path to parse.
+  if (path_begin == spec_len)
+    return;
+  GURL_DCHECK_LT(path_begin, spec_len);
+
+  // Split the remainder into path, query and ref.
+  ParsePath(spec,
+            MakeRange(path_begin, spec_len),
+            &parsed->path, &parsed->query, &parsed->ref);
+}
+
+template<typename CHAR>
+void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  GURL_DCHECK(spec_len >= 0);
+
+  // Only the scheme, path and query are ever filled in here; get the other
+  // components out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->ref.reset();
+  parsed->query.reset();  // May use this; reset for convenience.
+
+  // Trim spaces and control characters around the spec.
+  int trimmed_begin = 0;
+  TrimURL(spec, &trimmed_begin, &spec_len);
+
+  // Nothing left: the spec was empty or held only whitespace/control chars.
+  if (trimmed_begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  // [path_begin, path_end) is whatever follows the scheme. Both stay at -1,
+  // an empty range, when the scheme's colon is the last character.
+  int path_begin = -1;
+  int path_end = -1;
+
+  if (!ExtractScheme(&spec[trimmed_begin], spec_len - trimmed_begin,
+                     &parsed->scheme)) {
+    // No scheme found, so the whole trimmed spec is the path.
+    parsed->scheme.reset();
+    path_begin = trimmed_begin;
+    path_end = spec_len;
+  } else {
+    // ExtractScheme was handed a substring, so shift its result back.
+    parsed->scheme.begin += trimmed_begin;
+    const int after_colon = parsed->scheme.end() + 1;
+    if (after_colon != spec_len) {
+      path_begin = after_colon;
+      path_end = spec_len;
+    }
+  }
+
+  // The first '?' separates the path from the query.
+  for (int pos = path_begin; pos < path_end; ++pos) {
+    if (spec[pos] != '?')
+      continue;
+    parsed->query = MakeRange(pos + 1, path_end);
+    path_end = pos;
+    break;
+  }
+
+  // For compatibility with the standard URL parser, an absent path is
+  // reported as unspecified (length -1) rather than as a zero-length range.
+  if (path_begin != path_end)
+    parsed->path = MakeRange(path_begin, path_end);
+  else
+    parsed->path.reset();
+}
+
+// Converts the port digits in |component| to an integer. The input is not
+// NULL-terminated, so the digits are first copied into a small stack buffer
+// (the maximum number of digits in a valid port number is known) that can be
+// NULL-terminated and handed to atoi.
+template<typename CHAR>
+int DoParsePort(const CHAR* spec, const Component& component) {
+  const int kMaxDigits = 5;
+
+  // Easy case: there is no port at all.
+  if (!component.is_nonempty())
+    return PORT_UNSPECIFIED;
+
+  // Find the first character that is not a leading zero.
+  int first = component.begin;
+  while (first < component.end() && spec[first] == '0')
+    ++first;
+  if (first == component.end())
+    return 0;  // All digits were 0.
+
+  // The digits are about to be copied into a fixed-size buffer, so reject
+  // anything that would not fit.
+  const Component digits_comp = MakeRange(first, component.end());
+  if (digits_comp.len > kMaxDigits)
+    return PORT_INVALID;
+
+  // Copy the digits, failing on the first character that is not one.
+  char digits[kMaxDigits + 1];  // +1 for null terminator
+  for (int i = 0; i < digits_comp.len; ++i) {
+    const CHAR ch = spec[digits_comp.begin + i];
+    if (!IsPortDigit(ch)) {
+      // Invalid port digit, fail.
+      return PORT_INVALID;
+    }
+    digits[i] = static_cast<char>(ch);
+  }
+
+  // Null-terminate the buffer and convert it. Only digits reached it, so
+  // atoi's lack of error handling is OK.
+  digits[digits_comp.len] = 0;
+  const int port = atoi(digits);
+
+  // Five digits can still exceed the largest port number.
+  if (port > 65535)
+    return PORT_INVALID;  // Out of range.
+  return port;
+}
+
+template<typename CHAR>
+void DoExtractFileName(const CHAR* spec,
+                       const Component& path,
+                       Component* file_name) {
+  // An empty or unspecified path has no file name.
+  if (!path.is_nonempty()) {
+    file_name->reset();
+    return;
+  }
+
+  // Walk backwards from the end of the path. The name starts right after the
+  // last slash and stops at the earliest semicolon seen after that slash. If
+  // no slash turns up the input was degenerate (paths generally begin with
+  // one) and the name starts where the path starts.
+  int name_begin = path.begin;
+  int name_end = path.end();
+  for (int pos = path.end() - 1; pos >= path.begin; --pos) {
+    const CHAR ch = spec[pos];
+    if (IsURLSlash(ch)) {
+      name_begin = pos + 1;
+      break;
+    }
+    if (ch == ';')
+      name_end = pos;
+  }
+
+  *file_name = MakeRange(name_begin, name_end);
+}
+
+template<typename CHAR>
+bool DoExtractQueryKeyValue(const CHAR* spec,
+                            Component* query,
+                            Component* key,
+                            Component* value) {
+  // Nothing left to iterate over; |*key| and |*value| are left untouched.
+  if (!query->is_nonempty())
+    return false;
+
+  const int end = query->end();
+  int pos = query->begin;
+  // The key runs from the start of the query up to the first '=' or '&'.
+  const int key_begin = pos;
+  for (; pos < end; ++pos) {
+    if (spec[pos] == '&' || spec[pos] == '=')
+      break;
+  }
+  *key = MakeRange(key_begin, pos);
+
+  // Step over the '=' separating key and value, when there is one.
+  if (pos < end && spec[pos] == '=')
+    ++pos;
+
+  // The value runs up to the next '&' or the end of the query.
+  const int value_begin = pos;
+  for (; pos < end; ++pos) {
+    if (spec[pos] == '&')
+      break;
+  }
+  *value = MakeRange(value_begin, pos);
+
+  // Step over the '&' ending this pair and return the rest of the query.
+  if (pos < end && spec[pos] == '&')
+    ++pos;
+  *query = MakeRange(pos, end);
+  return true;
+}
+
+} // namespace
+
+Parsed::Parsed() : potentially_dangling_markup(false), inner_parsed_(NULL) {}
+
+Parsed::Parsed(const Parsed& other)
+ : scheme(other.scheme),
+ username(other.username),
+ password(other.password),
+ host(other.host),
+ port(other.port),
+ path(other.path),
+ query(other.query),
+ ref(other.ref),
+ potentially_dangling_markup(other.potentially_dangling_markup),
+ inner_parsed_(NULL) { // Must be NULL before set_inner_parsed() runs below.
+ if (other.inner_parsed_)
+ set_inner_parsed(*other.inner_parsed_); // Copies the object, not the pointer.
+}
+
+Parsed& Parsed::operator=(const Parsed& other) {
+ if (this != &other) { // Self-assignment changes nothing.
+ scheme = other.scheme;
+ username = other.username;
+ password = other.password;
+ host = other.host;
+ port = other.port;
+ path = other.path;
+ query = other.query;
+ ref = other.ref;
+ potentially_dangling_markup = other.potentially_dangling_markup;
+ if (other.inner_parsed_)
+ set_inner_parsed(*other.inner_parsed_); // Copies into our own inner object.
+ else
+ clear_inner_parsed(); // Drop any inner object we still hold.
+ }
+ return *this;
+}
+
+Parsed::~Parsed() {
+ delete inner_parsed_; // Owned by this struct; may be NULL.
+}
+
+int Parsed::Length() const {
+  // A ref, when present, is the last component, so its end is the length.
+  // Otherwise ask where a ref would go if one were appended.
+  return ref.is_valid() ? ref.end() : CountCharactersBefore(REF, false);
+}
+
+int Parsed::CountCharactersBefore(ComponentType type,
+ bool include_delimiter) const {
+ if (type == SCHEME)
+ return scheme.begin;
+
+ // There will be some characters after the scheme like "://" and we don't
+ // know how many. Search forwards for the next thing until we find one.
+ int cur = 0;
+ if (scheme.is_valid())
+ cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme.
+
+ if (username.is_valid()) {
+ if (type <= USERNAME)
+ return username.begin;
+ cur = username.end() + 1; // Advance over the '@' or ':' at the end.
+ }
+
+ if (password.is_valid()) {
+ if (type <= PASSWORD)
+ return password.begin;
+ cur = password.end() + 1; // Advance over the '@' at the end.
+ }
+
+ if (host.is_valid()) {
+ if (type <= HOST)
+ return host.begin;
+ cur = host.end();
+ }
+
+ if (port.is_valid()) {
+ if (type < PORT || (type == PORT && include_delimiter))
+ return port.begin - 1; // Back over delimiter.
+ if (type == PORT)
+ return port.begin; // Don't want delimiter counted.
+ cur = port.end();
+ }
+
+ if (path.is_valid()) {
+ if (type <= PATH)
+ return path.begin;
+ cur = path.end();
+ }
+
+ if (query.is_valid()) {
+ if (type < QUERY || (type == QUERY && include_delimiter))
+ return query.begin - 1; // Back over delimiter.
+ if (type == QUERY)
+ return query.begin; // Don't want delimiter counted.
+ cur = query.end();
+ }
+
+ if (ref.is_valid()) {
+ if (type == REF && !include_delimiter)
+ return ref.begin; // Don't want delimiter counted.
+
+ // When there is a ref and we get here, the component we wanted was before
+ // this and not found, so we always know the beginning of the ref is right.
+ return ref.begin - 1; // Back over delimiter.
+ }
+
+ return cur; // Nothing at or after |type| exists: end of what was parsed.
+}
+
+Component Parsed::GetContent() const {
+  const int first = CountCharactersBefore(USERNAME, false);
+  const int count = Length() - first;
+  // As in the standard URL parser, no content is unspecified (-1), not empty.
+  if (count == 0)
+    return Component();
+  return Component(first, count);
+}
+
+bool ExtractScheme(const char* url, int url_len, Component* scheme) {
+ return DoExtractScheme(url, url_len, scheme); // char overload.
+}
+
+bool ExtractScheme(const gurl_base::char16* url, int url_len, Component* scheme) {
+ return DoExtractScheme(url, url_len, scheme); // char16 overload.
+}
+
+// Returns true for every character that may end the authority: anything
+// IsURLSlash accepts (which is meant to include backslash), '?' and '#'.
+bool IsAuthorityTerminator(gurl_base::char16 ch) {
+ return IsURLSlash(ch) || ch == '?' || ch == '#';
+}
+
+void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name) {
+ DoExtractFileName(url, path, file_name); // char overload.
+}
+
+void ExtractFileName(const gurl_base::char16* url,
+ const Component& path,
+ Component* file_name) {
+ DoExtractFileName(url, path, file_name); // char16 overload.
+}
+
+bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value) {
+ return DoExtractQueryKeyValue(url, query, key, value); // char overload.
+}
+
+bool ExtractQueryKeyValue(const gurl_base::char16* url,
+ Component* query,
+ Component* key,
+ Component* value) {
+ return DoExtractQueryKeyValue(url, query, key, value); // char16 overload.
+}
+
+void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num) {
+ DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+void ParseAuthority(const gurl_base::char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num) {
+ DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+int ParsePort(const char* url, const Component& port) {
+ return DoParsePort(url, port); // char overload.
+}
+
+int ParsePort(const gurl_base::char16* url, const Component& port) {
+ return DoParsePort(url, port); // char16 overload.
+}
+
+void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseStandardURL(url, url_len, parsed); // char overload.
+}
+
+void ParseStandardURL(const gurl_base::char16* url, int url_len, Parsed* parsed) {
+ DoParseStandardURL(url, url_len, parsed); // char16 overload.
+}
+
+void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed); // char overload.
+}
+
+void ParsePathURL(const gurl_base::char16* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed) {
+ DoParsePathURL(url, url_len, trim_path_end, parsed); // char16 overload.
+}
+
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseFileSystemURL(url, url_len, parsed); // char overload.
+}
+
+void ParseFileSystemURL(const gurl_base::char16* url, int url_len, Parsed* parsed) {
+ DoParseFileSystemURL(url, url_len, parsed); // char16 overload.
+}
+
+void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseMailtoURL(url, url_len, parsed); // char overload.
+}
+
+void ParseMailtoURL(const gurl_base::char16* url, int url_len, Parsed* parsed) {
+ DoParseMailtoURL(url, url_len, parsed); // char16 overload.
+}
+
+void ParsePathInternal(const char* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref) {
+ ParsePath(spec, path, filepath, query, ref); // Forwards to ParsePath (char).
+}
+
+void ParsePathInternal(const gurl_base::char16* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref) {
+ ParsePath(spec, path, filepath, query, ref); // Forwards to ParsePath (char16).
+}
+
+void ParseAfterScheme(const char* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ DoParseAfterScheme(spec, spec_len, after_scheme, parsed); // char overload.
+}
+
+void ParseAfterScheme(const gurl_base::char16* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ DoParseAfterScheme(spec, spec_len, after_scheme, parsed); // char16 overload.
+}
+
+} // namespace url
diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h
new file mode 100644
index 0000000..54b2af2
--- /dev/null
+++ b/url/third_party/mozilla/url_parse.h
@@ -0,0 +1,375 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
+#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
+
+#include "polyfills/base/component_export.h"
+#include "base/strings/string16.h"
+
+namespace url {
+
+// Component ------------------------------------------------------------------
+
+// A (begin, length) pair identifying a substring of the URL being parsed.
+struct Component {
+  // Creates an unspecified component: offset 0, length -1.
+  Component() : begin(0), len(-1) {}
+
+  // Creates a component from an offset |b| and a length |l|.
+  Component(int b, int l) : begin(b), len(l) {}
+
+  // Offset one past the last character of the component.
+  int end() const {
+    return len + begin;
+  }
+
+  // True when a length has been given. A valid component may still be empty,
+  // which records that the component exists but has no content.
+  bool is_valid() const { return len != -1; }
+
+  // True when the component holds at least one character; false when it is
+  // either empty or invalid.
+  bool is_nonempty() const { return len > 0; }
+
+  // Puts the component back into the unspecified state.
+  void reset() {
+    len = -1;
+    begin = 0;
+  }
+
+  // Two components are equal when both offset and length match.
+  bool operator==(const Component& other) const {
+    return len == other.len && begin == other.begin;
+  }
+
+  int begin;  // Byte offset in the string of this component.
+  int len;    // Will be -1 if the component is unspecified.
+};
+
+// Builds the component for the half-open range [begin, end).
+inline Component MakeRange(int begin, int end) {
+  const int length = end - begin;
+  return Component(begin, length);
+}
+
+// Parsed ---------------------------------------------------------------------
+
+// A structure that holds the identified parts of an input URL. This structure
+// does NOT store the URL itself. The caller will have to store the URL text
+// and its corresponding Parsed structure separately.
+//
+// Typical usage would be:
+//
+// Parsed parsed;
+// Component scheme;
+// if (!ExtractScheme(url, url_len, &scheme))
+// return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
+//
+// if (IsStandardScheme(url, scheme)) // Not provided by this component
+// ParseStandardURL(url, url_len, &parsed);
+// else if (IsFileURL(url, scheme)) // Not provided by this component
+// ParseFileURL(url, url_len, &parsed);
+// else
+// ParsePathURL(url, url_len, trim_path_end, &parsed);
+//
+struct COMPONENT_EXPORT(URL) Parsed {
+ // Identifies different components.
+ enum ComponentType {
+ SCHEME,
+ USERNAME,
+ PASSWORD,
+ HOST,
+ PORT,
+ PATH,
+ QUERY,
+ REF,
+ };
+
+ // The default constructor is sufficient for the components, but inner_parsed_
+ // requires special handling.
+ Parsed();
+ Parsed(const Parsed&);
+ Parsed& operator=(const Parsed&);
+ ~Parsed();
+
+ // Returns the length of the URL (the end of the last component).
+ //
+ // Note that for some invalid, non-canonical URLs, this may not be the length
+ // of the string. For example "http://": the parsed structure will only
+ // contain an entry for the four-character scheme, and it doesn't know about
+ // the "://". For all other last-components, it will return the real length.
+ int Length() const;
+
+ // Returns the number of characters before the given component if it exists,
+ // or where the component would be if it did exist. This will return the
+ // string length if the component would be appended to the end.
+ //
+ // Note that this can get a little funny for the port, query, and ref
+ // components which have a delimiter that is not counted as part of the
+ // component. The |include_delimiter| flag controls if you want this counted
+ // as part of the component or not when the component exists.
+ //
+ // This example shows the difference between the two flags for two of these
+ // delimited components that are present (the port and query) and one that
+ // isn't (the reference). The components that this flag affects are marked
+ // with a *.
+ //                 0         1         2
+ //                 012345678901234567890
+ // Example input:  http://foo:80/?query
+ //              include_delim=true,  ...=false  ("<-" indicates different)
+ //      SCHEME: 0                    0
+ //    USERNAME: 5                    5
+ //    PASSWORD: 5                    5
+ //        HOST: 7                    7
+ //       *PORT: 10                   11 <-
+ //        PATH: 13                   13
+ //      *QUERY: 14                   15 <-
+ //        *REF: 20                   20
+ //
+ int CountCharactersBefore(ComponentType type, bool include_delimiter) const;
+
+ // Scheme without the colon: "http://foo/" would have a scheme of "http".
+ // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
+ // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed
+ // to start at the beginning of the string if there are preceding whitespace
+ // or control characters.
+ Component scheme;
+
+ // Username. Specified in URLs with an @ sign before the host. See |password|.
+ Component username;
+
+ // Password. The length will be -1 if unspecified, 0 if specified but empty.
+ // Not all URLs with a username have a password, as in "http://me@host/".
+ // The password is separated from the username with a colon, as in
+ // "http://me:secret@host/"
+ Component password;
+
+ // Host name.
+ Component host;
+
+ // Port number.
+ Component port;
+
+ // Path, this is everything following the host name, stopping at the query or
+ // ref delimiter (if any). Length will be -1 if unspecified. This includes
+ // the preceding slash, so the path on "http://www.google.com/asdf" is
+ // "/asdf". As a result, it is impossible to have a 0 length path, it will
+ // be -1 in cases like "http://host?foo".
+ // Note that we treat backslashes the same as slashes.
+ Component path;
+
+ // Stuff between the ? and the # after the path. This does not include the
+ // preceding ? character. Length will be -1 if unspecified, 0 if there is
+ // a question mark but no query string.
+ Component query;
+
+ // Indicated by a #, this is everything following the hash sign (not
+ // including it). If there are multiple hash signs, we'll use the last one.
+ // Length will be -1 if there is no hash sign, or 0 if there is one but
+ // nothing follows it.
+ Component ref;
+
+ // The URL spec from the character after the scheme: until the end of the
+ // URL, regardless of the scheme. This is mostly useful for 'opaque' non-
+ // hierarchical schemes like data: and javascript: as a convenient way to get
+ // the string with the scheme stripped off.
+ Component GetContent() const;
+
+ // True if the URL's source contained a raw `<` character, and whitespace was
+ // removed from the URL during parsing.
+ //
+ // TODO(mkwst): Link this to something in a spec if
+ // https://github.com/whatwg/url/pull/284 lands.
+ bool potentially_dangling_markup;
+
+ // This is used for nested URL types, currently only filesystem. If you
+ // parse a filesystem URL, the resulting Parsed will have a nested
+ // inner_parsed_ to hold the parsed inner URL's component information.
+ // For all other url types [including the inner URL], it will be NULL.
+ Parsed* inner_parsed() const {
+ return inner_parsed_;
+ }
+
+ void set_inner_parsed(const Parsed& inner_parsed) {
+ if (!inner_parsed_)
+ inner_parsed_ = new Parsed(inner_parsed); // First use: allocate our own copy.
+ else
+ *inner_parsed_ = inner_parsed; // Reuse the object we already own.
+ }
+
+ void clear_inner_parsed() {
+ if (inner_parsed_) {
+ delete inner_parsed_;
+ inner_parsed_ = NULL; // Keeps the destructor from deleting it again.
+ }
+ }
+
+ private:
+ Parsed* inner_parsed_; // This object is owned and managed by this struct.
+};
+
+// Initialization functions ---------------------------------------------------
+//
+// These functions parse the given URL, filling in all of the structure's
+// components. These functions cannot fail; they will always do their best
+// at interpreting the input given.
+//
+// The string length of the URL MUST be specified, we do not check for NULLs
+// at any point in the process, and will actually handle embedded NULLs.
+//
+// IMPORTANT: These functions do NOT hang on to the given pointer or copy it
+// in any way. See the comment above the struct.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// StandardURL is for when the scheme is known to be one that has an
+// authority (host) like "http". This function will not handle weird ones
+// like "about:" and "javascript:", or do the right thing for "file:" URLs.
+COMPONENT_EXPORT(URL)
+void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
+COMPONENT_EXPORT(URL)
+void ParseStandardURL(const gurl_base::char16* url, int url_len, Parsed* parsed);
+
+// PathURL is for URLs whose scheme is known not to have an authority (host)
+// section and that aren't file URLs either. The scheme is parsed, and
+// everything after the scheme is considered as the path. This is used for
+// things like "about:" and "javascript:". |trim_path_end| is passed to TrimURL.
+COMPONENT_EXPORT(URL)
+void ParsePathURL(const char* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed);
+COMPONENT_EXPORT(URL)
+void ParsePathURL(const gurl_base::char16* url,
+ int url_len,
+ bool trim_path_end,
+ Parsed* parsed);
+
+// FileURL is for file URLs. There are some special rules for interpreting
+// these.
+COMPONENT_EXPORT(URL)
+void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+COMPONENT_EXPORT(URL)
+void ParseFileURL(const gurl_base::char16* url, int url_len, Parsed* parsed);
+
+// Filesystem URLs are structured differently than other URLs.
+COMPONENT_EXPORT(URL)
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed);
+COMPONENT_EXPORT(URL)
+void ParseFileSystemURL(const gurl_base::char16* url, int url_len, Parsed* parsed);
+
+// MailtoURL is for mailto: URLs. They are made up of a scheme, path and query.
+COMPONENT_EXPORT(URL)
+void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+COMPONENT_EXPORT(URL)
+void ParseMailtoURL(const gurl_base::char16* url, int url_len, Parsed* parsed);
+
+// Helper functions -----------------------------------------------------------
+
+// Locates the scheme according to the URL parser's rules. This function is
+// designed so the caller can find the scheme and call the correct Parse*
+// function according to their known scheme types.
+//
+// It also does not perform any validation on the scheme.
+//
+// This function will return true if the scheme is found and will put the
+// scheme's range into *scheme. False means no scheme could be found. Note
+// that a URL beginning with a colon has a scheme, but it is empty, so this
+// function will return true but *scheme will = (0,0).
+//
+// The scheme is found by skipping spaces and control characters at the
+// beginning, and taking everything from there to the first colon to be the
+// scheme. The character at scheme.end() will be the colon (we may enhance
+// this to handle full width colons or something, so don't count on the
+// actual character value). The character at scheme.end()+1 will be the
+// beginning of the rest of the URL, be it the authority or the path (or the
+// end of the string).
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+bool ExtractScheme(const char* url, int url_len, Component* scheme);
+COMPONENT_EXPORT(URL)
+bool ExtractScheme(const gurl_base::char16* url, int url_len, Component* scheme);
+
+// Returns true if ch is a character that terminates the authority segment
+// of a URL.
+COMPONENT_EXPORT(URL) bool IsAuthorityTerminator(gurl_base::char16 ch);
+
+// Does a best effort parse of input |spec|, in range |auth|. If a particular
+// component is not found, it will be set to invalid.
+COMPONENT_EXPORT(URL)
+void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+COMPONENT_EXPORT(URL)
+void ParseAuthority(const gurl_base::char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+
+// Computes the integer port value from the given port component. The port
+// component should have been identified by one of the init functions on
+// |Parsed| for the given input url.
+//
+// The return value will be an integer from 0 to 65535 inclusive, or one of
+// the two special values below.
+enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
+COMPONENT_EXPORT(URL) int ParsePort(const char* url, const Component& port);
+COMPONENT_EXPORT(URL)
+int ParsePort(const gurl_base::char16* url, const Component& port);
+
+// Extracts the range of the file name in the given url. The path must
+// already have been computed by the parse function, and the matching URL
+// and extracted path are provided to this function. The filename is
+// defined as being everything from the last slash/backslash of the path
+// to the end of the path, or to the first semicolon after that slash.
+//
+// The file name will be unspecified if the path is empty, and empty if
+// nothing follows the last slash.
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name);
+COMPONENT_EXPORT(URL)
+void ExtractFileName(const gurl_base::char16* url,
+ const Component& path,
+ Component* file_name);
+
+// Extract the first key/value from the range defined by |*query|. Updates
+// |*query| to start at the end of the extracted key/value pair. This is
+// designed for use in a loop: you can keep calling it with the same query
+// object and it will iterate over all items in the query.
+//
+// Some key/value pairs may have the key, the value, or both be empty (for
+// example, the query string "?&"). These will be returned. Note that an empty
+// last parameter ("foo.com?" or "foo.com?a&") will not be returned; this case
+// is the same as "done."
+//
+// The initial query component should not include the '?' (this is the default
+// for parsed URLs).
+//
+// If no key/value are found |*key| and |*value| will be unchanged and it will
+// return false.
+COMPONENT_EXPORT(URL)
+bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value);
+COMPONENT_EXPORT(URL)
+bool ExtractQueryKeyValue(const gurl_base::char16* url,
+ Component* query,
+ Component* key,
+ Component* value);
+
+} // namespace url
+
+#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
diff --git a/url/url_canon.cc b/url/url_canon.cc
new file mode 100644
index 0000000..1860234
--- /dev/null
+++ b/url/url_canon.cc
@@ -0,0 +1,15 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon.h"
+
+#include "polyfills/base/component_export.h"
+
+namespace url {
+
+template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL)) CanonOutputT<char>;
+template class EXPORT_TEMPLATE_DEFINE(COMPONENT_EXPORT(URL))
+ CanonOutputT<gurl_base::char16>;
+
+} // namespace url
diff --git a/url/url_canon.h b/url/url_canon.h
new file mode 100644
index 0000000..7e4a0ee
--- /dev/null
+++ b/url/url_canon.h
@@ -0,0 +1,1013 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_H_
+#define URL_URL_CANON_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/export_template.h"
+#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
+
+namespace url {
+
+// Canonicalizer output -------------------------------------------------------
+
+// Base class for the canonicalizer output, this maintains a buffer and
+// supports simple resizing and append operations on it.
+//
+// It is VERY IMPORTANT that no virtual function calls be made on the common
+// code path. We only have two virtual function calls, the destructor and a
+// resize function that is called when the existing buffer is not big enough.
+// The derived class is then in charge of setting up our buffer which we will
+// manage.
+template<typename T>
+class CanonOutputT {
+ public:
+  CanonOutputT() : buffer_(nullptr), buffer_len_(0), cur_len_(0) {
+  }
+  virtual ~CanonOutputT() {
+  }
+
+  // Implemented to resize the buffer. This function should update the buffer
+  // pointer to point to the new buffer, and any old data up to |cur_len_| in
+  // the buffer must be copied over.
+  //
+  // The new size |sz| must be larger than buffer_len_.
+  virtual void Resize(int sz) = 0;
+
+  // Accessor for returning a character at a given position. The input offset
+  // must be in the valid range.
+  inline T at(int offset) const {
+    return buffer_[offset];
+  }
+
+  // Sets the character at the given position. The given position MUST be less
+  // than the length().
+  inline void set(int offset, T ch) {
+    buffer_[offset] = ch;
+  }
+
+  // Returns the number of characters currently in the buffer.
+  inline int length() const {
+    return cur_len_;
+  }
+
+  // Returns the current capacity of the buffer. The length() is the number of
+  // characters that have been declared to be written, but the capacity() is
+  // the number that can be written without reallocation. If the caller must
+  // write many characters at once, it can make sure there is enough capacity,
+  // write the data, then use set_length() to declare the new length().
+  int capacity() const {
+    return buffer_len_;
+  }
+
+  // Called by the user of this class to get the output. The output will NOT
+  // be NULL-terminated. Call length() to get the length. The pointer is
+  // only valid until the next operation that resizes the buffer.
+  const T* data() const {
+    return buffer_;
+  }
+  T* data() {
+    return buffer_;
+  }
+
+  // Shortens the URL to the new length. Used for "backing up" when processing
+  // relative paths. This can also be used if an external function writes a lot
+  // of data to the buffer (when using the "Raw" version below) beyond the end,
+  // to declare the new length.
+  //
+  // This MUST NOT be used to expand the size of the buffer beyond capacity().
+  void set_length(int new_len) {
+    cur_len_ = new_len;
+  }
+
+  // This is the most performance critical function, since it is called for
+  // every character.
+  void push_back(T ch) {
+    // In VC2005, putting this common case first speeds up execution
+    // dramatically because this branch is predicted as taken.
+    if (cur_len_ < buffer_len_) {
+      buffer_[cur_len_] = ch;
+      cur_len_++;
+      return;
+    }
+
+    // Grow the buffer to hold at least one more item. Hopefully we won't have
+    // to do this very often.
+    if (!Grow(1))
+      return;
+
+    // Actually do the insertion.
+    buffer_[cur_len_] = ch;
+    cur_len_++;
+  }
+
+  // Appends |str_len| characters of |str|; does nothing if growth fails.
+  void Append(const T* str, int str_len) {
+    if (str_len > buffer_len_ - cur_len_) {  // Subtraction cannot overflow.
+      if (!Grow(str_len - (buffer_len_ - cur_len_)))
+        return;
+    }
+    if (str_len > 0)  // Bulk copy; skipped for empty input (|str| may be null).
+      memcpy(buffer_ + cur_len_, str, sizeof(T) * str_len);
+    cur_len_ += str_len;
+  }
+
+  void ReserveSizeIfNeeded(int estimated_size) {
+    // Reserve a bit extra to account for escaped chars.
+    if (estimated_size > buffer_len_)
+      Resize(estimated_size + 8);
+  }
+
+ protected:
+  // Grows the buffer so that it can fit at least |min_additional| more
+  // characters. Returns false (without resizing) once the 1 << 30 cap is hit.
+  bool Grow(int min_additional) {
+    static const int kMinBufferLen = 16;
+    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
+    do {
+      if (new_len >= (1 << 30))  // Prevent overflow below.
+        return false;
+      new_len *= 2;
+    } while (new_len - buffer_len_ < min_additional);  // Overflow-free form.
+    Resize(new_len);
+    return true;
+  }
+
+  T* buffer_;
+  int buffer_len_;
+
+  // Used characters in the buffer.
+  int cur_len_;
+};
+
+// Simple implementation of the CanonOutput using new[]. This class
+// also supports a static buffer so if it is allocated on the stack, most
+// URLs can be canonicalized with no heap allocations.
+template<typename T, int fixed_capacity = 1024>
+class RawCanonOutputT : public CanonOutputT<T> {
+ public:
+  // Starts out writing into the inline array; no heap use until Resize().
+  RawCanonOutputT() : CanonOutputT<T>() {
+    this->buffer_len_ = fixed_capacity;
+    this->buffer_ = fixed_buffer_;
+  }
+  ~RawCanonOutputT() override {
+    if (this->buffer_ != fixed_buffer_) delete[] this->buffer_;
+  }
+
+  // Moves the contents into a fresh heap array of |sz| elements.
+  void Resize(int sz) override {
+    const int keep = (sz > this->cur_len_) ? this->cur_len_ : sz;
+    T* const replacement = new T[sz];
+    memcpy(replacement, this->buffer_, sizeof(T) * keep);
+    if (this->buffer_ != fixed_buffer_) delete[] this->buffer_;
+    this->buffer_len_ = sz;
+    this->buffer_ = replacement;
+  }
+
+ protected:
+  T fixed_buffer_[fixed_capacity];
+};
+
+// Explicitly instantiate commonly used instantiations.
+extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL))
+ CanonOutputT<char>;
+extern template class EXPORT_TEMPLATE_DECLARE(COMPONENT_EXPORT(URL))
+ CanonOutputT<gurl_base::char16>;
+
+// Normally, all canonicalization output is in narrow characters. We support
+// the templates so it can also be used internally if a wide buffer is
+// required.
+typedef CanonOutputT<char> CanonOutput;
+typedef CanonOutputT<gurl_base::char16> CanonOutputW;
+
+template<int fixed_capacity>
+class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
+template<int fixed_capacity>
+class RawCanonOutputW : public RawCanonOutputT<gurl_base::char16, fixed_capacity> {};
+
+// Character set converter ----------------------------------------------------
+//
+// Converts query strings into a custom encoding. The embedder can supply an
+// implementation of this class to interface with their own character set
+// conversion libraries.
+//
+// Embedders will want to see the unit test for the ICU version.
+
+class COMPONENT_EXPORT(URL) CharsetConverter {
+ public:
+ CharsetConverter() {}
+ virtual ~CharsetConverter() {}
+
+ // Converts the given input string from UTF-16 to whatever output format the
+ // converter supports. This is used only for the query encoding conversion,
+ // which does not fail. Instead, the converter should insert "invalid
+ // character" characters in the output for invalid sequences, and do the
+ // best it can.
+ //
+ // If the input contains a character not representable in the output
+ // character set, the converter should append the HTML entity sequence in
+ // decimal, (such as "&#20320;") with escaping of the ampersand, number
+ // sign, and semicolon (in the previous example it would be
+ // "%26%2320320%3B"). This rule is based on what IE does in this situation.
+ virtual void ConvertFromUTF16(const gurl_base::char16* input,
+ int input_len,
+ CanonOutput* output) = 0;
+};
+
+// Schemes --------------------------------------------------------------------
+
+// Types of a scheme representing the requirements on the data represented by
+// the authority component of a URL with the scheme.
+enum SchemeType {
+ // The authority component of a URL with the scheme has the form
+ // "username:password@host:port". The username and password entries are
+ // optional; the host may not be empty. The default value of the port can be
+ // omitted in serialization. This type occurs with network schemes like http,
+ // https, and ftp.
+ SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION,
+ // The authority component of a URL with the scheme has the form "host:port",
+ // and does not include username or password. The default value of the port
+ // can be omitted in serialization. Used by inner URLs of filesystem URLs of
+ // origins with network hosts, from which the username and password are
+ // stripped.
+ SCHEME_WITH_HOST_AND_PORT,
+ // The authority component of a URL with the scheme has the form "host", and
+ // does not include port, username, or password. Used when the hosts are not
+ // network addresses; for example, schemes used internally by the browser.
+ SCHEME_WITH_HOST,
+ // A URL with the scheme doesn't have the authority component.
+ SCHEME_WITHOUT_AUTHORITY,
+};
+
+// Whitespace -----------------------------------------------------------------
+
+// Searches for whitespace that should be removed from the middle of URLs, and
+// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
+// are preserved, which is what most browsers do. A pointer to the output will
+// be returned, and the length of that output will be in |output_len|.
+//
+// This should be called before parsing if whitespace removal is desired (which
+// it normally is when you are canonicalizing).
+//
+// If no whitespace is removed, this function will not use the buffer and will
+// return a pointer to the input, to avoid the extra copy. If modification is
+// required, the given |buffer| will be used and the returned pointer will
+// point to the beginning of the buffer.
+//
+// Therefore, callers should not use the buffer, since it may actually be empty;
+// use the computed pointer and |*output_len| instead.
+//
+// If |input| contained both removable whitespace and a raw `<` character,
+// |potentially_dangling_markup| will be set to `true`. Otherwise, it will be
+// left untouched.
+COMPONENT_EXPORT(URL)
+const char* RemoveURLWhitespace(const char* input,
+ int input_len,
+ CanonOutputT<char>* buffer,
+ int* output_len,
+ bool* potentially_dangling_markup);
+COMPONENT_EXPORT(URL)
+const gurl_base::char16* RemoveURLWhitespace(const gurl_base::char16* input,
+ int input_len,
+ CanonOutputT<gurl_base::char16>* buffer,
+ int* output_len,
+ bool* potentially_dangling_markup);
+
+// IDN ------------------------------------------------------------------------
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must fall in the ASCII range, but will be encoded in UTF-16.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, returns false. The output in this case is undefined.
+COMPONENT_EXPORT(URL)
+bool IDNToASCII(const gurl_base::char16* src, int src_len, CanonOutputW* output);
+
+// Piece-by-piece canonicalizers ----------------------------------------------
+//
+// These individual canonicalizers append the canonicalized versions of the
+// corresponding URL component to the given output. The spec and the
+// previously-identified range of that component are the input. The range of
+// the canonicalized component will be written to the output component.
+//
+// These functions all append to the output so they can be chained. Make sure
+// the output is empty when you start.
+//
+// These functions return boolean values indicating success. On failure, they
+// will attempt to write something reasonable to the output so that, if
+// displayed to the user, they will recognise it as something that's messed up.
+// Nothing more should ever be done with these invalid URLs, however.
+
+// Scheme: Appends the scheme and colon to the URL. The output component will
+// indicate the range of characters up to but not including the colon.
+//
+// Canonical URLs always have a scheme. If the scheme is not present in the
+// input, this will just write the colon to indicate an empty scheme. Does not
+// append slashes which will be needed before any authority components for most
+// URLs.
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeScheme(const char* spec,
+ const Component& scheme,
+ CanonOutput* output,
+ Component* out_scheme);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeScheme(const gurl_base::char16* spec,
+ const Component& scheme,
+ CanonOutput* output,
+ Component* out_scheme);
+
+// User info: username/password. If present, this will add the delimiters so
+// the output will be "<username>:<password>@" or "<username>@". Empty
+// username/password pairs, or empty passwords, will get converted to
+// nonexistent in the canonical version.
+//
+// The components for the username and password refer to ranges in the
+// respective source strings. Usually, these will be the same string, which
+// is legal as long as the two components don't overlap.
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeUserInfo(const char* username_source,
+ const Component& username,
+ const char* password_source,
+ const Component& password,
+ CanonOutput* output,
+ Component* out_username,
+ Component* out_password);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeUserInfo(const gurl_base::char16* username_source,
+ const Component& username,
+ const gurl_base::char16* password_source,
+ const Component& password,
+ CanonOutput* output,
+ Component* out_username,
+ Component* out_password);
+
+// This structure holds detailed state exported from the IP/Host canonicalizers.
+// Additional fields may be added as callers require them.
+struct CanonHostInfo {
+  CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
+
+  // Reports whether the host was canonicalized as an IPv4 or IPv6 address.
+  bool IsIPAddress() const { return family == IPV6 || family == IPV4; }
+
+  // How the canonicalizer classified the input.
+  enum Family {
+    NEUTRAL,  // - Does not look like an IP address; the IP canonicalizer
+              //   leaves it to be handled as an ordinary
+              //   hostname.
+    BROKEN,   // - Nearly an IP address, yet not canonicalized: either an
+              //   IPv4 address where truncation occurred, or input with
+              //   the special characters :[] that failed to parse as
+              //   IPv6. Never attempt to connect to this address,
+              //   because it might actually succeed!
+    IPV4,     // - Successfully canonicalized as an IPv4 address.
+    IPV6,     // - Successfully canonicalized as an IPv6 address.
+  };
+  Family family;
+
+  // Count of nonempty dot-separated components in the input text (1 to 4).
+  // Only meaningful when |family| is IPV4; for any other family the
+  // value is undefined.
+  int num_ipv4_components;
+
+  // Where the host sits within the canonicalized output.
+  // CanonicalizeIPAddress() fills this in only for IPV4 or IPV6 families,
+  // whereas CanonicalizeHostVerbose() always sets it.
+  Component out_host;
+
+  // Parsed IP address (if any), stored in network order in the first
+  // AddressLength() bytes. When IsIPAddress() is false, AddressLength()
+  // is zero and the content of |address| is undefined.
+  unsigned char address[16];
+
+  // Number of valid bytes in |address| for the IP version in |family|.
+  int AddressLength() const {
+    if (family == IPV4) return 4;
+    return (family == IPV6) ? 16 : 0;
+  }
+};
+
+
+// Host.
+//
+// The 8-bit version requires UTF-8 encoding. Use this version when you only
+// need to know whether canonicalization succeeded.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeHost(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ Component* out_host);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeHost(const gurl_base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ Component* out_host);
+
+// Extended version of CanonicalizeHost, which returns additional information.
+// Use this when you need to know whether the hostname was an IP address.
+// A successful return is indicated by host_info->family != BROKEN. See the
+// definition of CanonHostInfo above for details.
+COMPONENT_EXPORT(URL)
+void CanonicalizeHostVerbose(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+COMPONENT_EXPORT(URL)
+void CanonicalizeHostVerbose(const gurl_base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+
+// Canonicalizes a string according to the host canonicalization rules. Unlike
+// CanonicalizeHost, this will not check for IP addresses which can change the
+// meaning (and canonicalization) of the components. This means it is possible
+// to call this for sub-components of a host name without corruption.
+//
+// As an example, "01.02.03.04.com" is a canonical hostname. If you called
+// CanonicalizeHost on the substring "01.02.03.04" it will get "fixed" to
+// "1.2.3.4" which will produce an invalid host name when reassembled. This
+// can happen more than one might think because all numbers by themselves are
+// considered IP addresses; so "5" canonicalizes to "0.0.0.5".
+//
+// Be careful: Because Punycode works on each dot-separated substring as a
+// unit, you should only pass this function substrings that represent complete
+// dot-separated subcomponents of the original host. Even if you have ASCII
+// input, percent-escaped characters will have different meanings if split in
+// the middle.
+//
+// Returns true if the host was valid. This function will treat a 0-length
+// host as valid (because it's designed to be used for substrings) while the
+// full version above will mark empty hosts as broken.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeHostSubstring(const char* spec,
+ const Component& host,
+ CanonOutput* output);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeHostSubstring(const gurl_base::char16* spec,
+ const Component& host,
+ CanonOutput* output);
+
+// IP addresses.
+//
+// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
+// an IP address, it will canonicalize it as such, appending it to |output|.
+// Additional status information is returned via the |*host_info| parameter.
+// See the definition of CanonHostInfo above for details.
+//
+// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
+// the input is unescaped and name-prepped, etc. It should not normally be
+// necessary or wise to call this directly.
+COMPONENT_EXPORT(URL)
+void CanonicalizeIPAddress(const char* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+COMPONENT_EXPORT(URL)
+void CanonicalizeIPAddress(const gurl_base::char16* spec,
+ const Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+
+// Port: this function will add the colon for the port if a port is present.
+// The caller can pass PORT_UNSPECIFIED as the
+// default_port_for_scheme argument if there is no default port.
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+bool CanonicalizePort(const char* spec,
+ const Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ Component* out_port);
+COMPONENT_EXPORT(URL)
+bool CanonicalizePort(const gurl_base::char16* spec,
+ const Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ Component* out_port);
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+COMPONENT_EXPORT(URL)
+int DefaultPortForScheme(const char* scheme, int scheme_len);
+
+// Path. If the input does not begin in a slash (including if the input is
+// empty), we'll prepend a slash to the path to make it canonical.
+//
+// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
+// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
+// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
+// an issue. Somebody giving us an 8-bit path is responsible for generating
+// the path that the server expects (we'll escape high-bit characters), so
+// if something is invalid, it's their problem.
+COMPONENT_EXPORT(URL)
+bool CanonicalizePath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+COMPONENT_EXPORT(URL)
+bool CanonicalizePath(const gurl_base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+
+// Canonicalizes the input as a file path. This is like CanonicalizePath except
+// that it also handles Windows drive specs. For example, the path can begin
+// with "c|\" and it will get properly canonicalized to "C:/".
+// The string will be appended to |*output| and |*out_path| will be updated.
+//
+// The 8-bit version requires UTF-8 encoding.
+COMPONENT_EXPORT(URL)
+bool FileCanonicalizePath(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+COMPONENT_EXPORT(URL)
+bool FileCanonicalizePath(const gurl_base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+
+// Query: Prepends the ? if needed.
+//
+// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
+// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
+// "invalid character." This function cannot fail; we always just try to do
+// our best for crazy input here since web pages can set it themselves.
+//
+// This will convert the given input into the output encoding that the given
+// character set converter object provides. The converter will only be called
+// if necessary, for ASCII input, no conversions are necessary.
+//
+// The converter can be NULL. In this case, the output encoding will be UTF-8.
+COMPONENT_EXPORT(URL)
+void CanonicalizeQuery(const char* spec,
+ const Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ Component* out_query);
+COMPONENT_EXPORT(URL)
+void CanonicalizeQuery(const gurl_base::char16* spec,
+ const Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ Component* out_query);
+
+// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
+// canonicalizer that does not produce ASCII output). The output is
+// guaranteed to be valid UTF-8.
+//
+// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
+// the "Unicode replacement character" for the confusing bits and copy the rest.
+COMPONENT_EXPORT(URL)
+void CanonicalizeRef(const char* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+COMPONENT_EXPORT(URL)
+void CanonicalizeRef(const gurl_base::char16* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path);
+
+// Full canonicalizer ---------------------------------------------------------
+//
+// These functions replace any string contents, rather than append as above.
+// See the above piece-by-piece functions for information specific to
+// canonicalizing individual components.
+//
+// The output will be ASCII except the reference fragment, which may be UTF-8.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// Use for standard URLs with authorities and paths.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeStandardURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeStandardURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Use for file URLs.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeFileURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeFileURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Use for filesystem URLs.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeFileSystemURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeFileSystemURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Use for path URLs such as javascript. This does not modify the path in any
+// way, for example, by escaping it.
+COMPONENT_EXPORT(URL)
+bool CanonicalizePathURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizePathURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Use for mailto URLs. This "canonicalizes" the URL into a path and query
+// component. It does not attempt to merge "to" fields. It uses UTF-8 for
+// the query encoding if there is a query. This is because a mailto URL is
+// really intended for an external mail program, and the encoding of a page,
+// etc. which would influence a query encoding normally are irrelevant.
+COMPONENT_EXPORT(URL)
+bool CanonicalizeMailtoURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool CanonicalizeMailtoURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Part replacer --------------------------------------------------------------
+
+// Internal structure used for storing separate strings for each component.
+// The basic canonicalization functions use this structure internally so that
+// component replacement (different strings for different components) can be
+// treated on the same code path as regular canonicalization (the same string
+// for each component).
+//
+// A Parsed structure usually goes along with this. Those components identify
+// offsets within these strings, so that they can all be in the same string,
+// or spread arbitrarily across different ones.
+//
+// This structure does not own any data. It is the caller's responsibility to
+// ensure that the data the pointers point to stays in scope and is not
+// modified.
+template<typename CHAR>
+struct URLComponentSource {
+  // Default construction, as normally used by callers that want to replace
+  // components: every source pointer is null, which means "no
+  // replacement". The caller then overrides only the components it wants
+  // to replace.
+  URLComponentSource() : URLComponentSource(nullptr) {}
+
+  // Normally used internally: makes every component refer to the same
+  // spec string, |default_value|, so one string can back all of the
+  // components.
+  explicit URLComponentSource(const CHAR* default_value)
+      : scheme(default_value),
+        username(default_value),
+        password(default_value),
+        host(default_value),
+        port(default_value),
+        path(default_value),
+        query(default_value),
+        ref(default_value) {
+  }
+
+  // One source-string pointer per URL component. This struct only stores
+  // the pointers; it never dereferences or frees them.
+  //
+  const CHAR* scheme;
+
+  // Authority pieces: user info, host, and port.
+  const CHAR* username;
+  const CHAR* password;
+  const CHAR* host;
+  const CHAR* port;
+
+  // Everything after the authority.
+  const CHAR* path;
+  const CHAR* query;
+  const CHAR* ref;
+};
+
+// This structure encapsulates information on modifying a URL. Each component
+// may either be left unchanged, replaced, or deleted.
+//
+// By default, each component is unchanged. For those components that should be
+// modified, call either Set* or Clear* to modify it.
+//
+// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
+// IN SCOPE BY THE CALLER for as long as this object exists!
+//
+// Prefer the 8-bit replacement version if possible since it is more efficient.
+template<typename CHAR>
+class Replacements {
+ public:
+  // Starts with nothing overridden: every source pointer is null.
+  Replacements() {}
+
+  // Scheme
+  void SetScheme(const CHAR* s, const Component& comp) {
+    components_.scheme = comp;
+    sources_.scheme = s;
+  }
+  // Note: there is deliberately no ClearScheme; removing it makes no sense.
+  bool IsSchemeOverridden() const { return sources_.scheme != nullptr; }
+
+  // Username
+  void SetUsername(const CHAR* s, const Component& comp) {
+    components_.username = comp;
+    sources_.username = s;
+  }
+  void ClearUsername() {
+    components_.username = Component();
+    sources_.username = Placeholder();
+  }
+  bool IsUsernameOverridden() const { return sources_.username != nullptr; }
+
+  // Password
+  void SetPassword(const CHAR* s, const Component& comp) {
+    components_.password = comp;
+    sources_.password = s;
+  }
+  void ClearPassword() {
+    components_.password = Component();
+    sources_.password = Placeholder();
+  }
+  bool IsPasswordOverridden() const { return sources_.password != nullptr; }
+
+  // Host
+  void SetHost(const CHAR* s, const Component& comp) {
+    components_.host = comp;
+    sources_.host = s;
+  }
+  void ClearHost() {
+    components_.host = Component();
+    sources_.host = Placeholder();
+  }
+  bool IsHostOverridden() const { return sources_.host != nullptr; }
+
+  // Port
+  void SetPort(const CHAR* s, const Component& comp) {
+    components_.port = comp;
+    sources_.port = s;
+  }
+  void ClearPort() {
+    components_.port = Component();
+    sources_.port = Placeholder();
+  }
+  bool IsPortOverridden() const { return sources_.port != nullptr; }
+
+  // Path
+  void SetPath(const CHAR* s, const Component& comp) {
+    components_.path = comp;
+    sources_.path = s;
+  }
+  void ClearPath() {
+    components_.path = Component();
+    sources_.path = Placeholder();
+  }
+  bool IsPathOverridden() const { return sources_.path != nullptr; }
+
+  // Query
+  void SetQuery(const CHAR* s, const Component& comp) {
+    components_.query = comp;
+    sources_.query = s;
+  }
+  void ClearQuery() {
+    components_.query = Component();
+    sources_.query = Placeholder();
+  }
+  bool IsQueryOverridden() const { return sources_.query != nullptr; }
+
+  // Ref
+  void SetRef(const CHAR* s, const Component& comp) {
+    components_.ref = comp;
+    sources_.ref = s;
+  }
+  void ClearRef() {
+    components_.ref = Component();
+    sources_.ref = Placeholder();
+  }
+  bool IsRefOverridden() const { return sources_.ref != nullptr; }
+
+  // Read access to the raw state; the encoding is described next to the
+  // member variables below.
+  const URLComponentSource<CHAR>& sources() const { return sources_; }
+  const Parsed& components() const { return components_; }
+
+ private:
+  // Hands back the address of a static NUL character, i.e. an empty string,
+  // which marks a component as "delete this" (see below).
+  const CHAR* Placeholder() {
+    static const CHAR kEmptyString = 0;
+    return &kEmptyString;
+  }
+
+  // We support three states:
+  //
+  //   Action                 | Source               Component
+  //   -----------------------+------------------------------------------------
+  //   Don't change component | NULL                 (unused)
+  //   Replace component      | (replacement string) (replacement component)
+  //   Delete component       | (non-NULL)           (invalid component: (0,-1))
+  //
+  // We use a pointer to the empty string for the source when the component
+  // should be deleted.
+  URLComponentSource<CHAR> sources_;
+  Parsed components_;
+};
+
+// The base must be an 8-bit canonical URL.
+COMPONENT_EXPORT(URL)
+bool ReplaceStandardURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplaceStandardURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Filesystem URLs can only have the path, query, or ref replaced.
+// All other components will be ignored.
+COMPONENT_EXPORT(URL)
+bool ReplaceFileSystemURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplaceFileSystemURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Replacing some parts of a file URL is not permitted. Everything except
+// the host, path, query, and ref will be ignored.
+COMPONENT_EXPORT(URL)
+bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplaceFileURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Path URLs can only have the scheme and path replaced. All other components
+// will be ignored.
+COMPONENT_EXPORT(URL)
+bool ReplacePathURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplacePathURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Mailto URLs can only have the scheme, path, and query replaced.
+// All other components will be ignored.
+COMPONENT_EXPORT(URL)
+bool ReplaceMailtoURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplaceMailtoURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CanonOutput* output,
+ Parsed* new_parsed);
+
+// Relative URL ---------------------------------------------------------------
+
+// Given an input URL or URL fragment |fragment|, determines if it is a
+// relative or absolute URL and places the result into |*is_relative|. If it is
+// relative, the relevant portion of the URL will be placed into
+// |*relative_component| (there may have been trimmed whitespace, for example).
+// This value is passed to ResolveRelativeURL. If the input is not relative,
+// this value is UNDEFINED (it may be changed by the function).
+//
+// Returns true on success (we successfully determined the URL is relative or
+// not). Failure means that the combination of URLs doesn't make any sense.
+//
+// The base URL should always be canonical, therefore is ASCII.
+COMPONENT_EXPORT(URL)
+bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const char* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component);
+COMPONENT_EXPORT(URL)
+bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const gurl_base::char16* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component);
+
+// Given a canonical parsed source URL, a URL fragment known to be relative,
+// and the identified relevant portion of the relative URL (computed by
+// IsRelativeURL), this produces a new parsed canonical URL in |output| and
+// |out_parsed|.
+//
+// It also requires a flag indicating whether the base URL is a file: URL
+// which triggers additional logic.
+//
+// The base URL should be canonical and have a host (may be empty for file
+// URLs) and a path. If it doesn't have these, we can't resolve relative
+// URLs off of it and will return the base as the output with an error flag.
+// Because it is canonical it should also be ASCII.
+//
+// The query charset converter follows the same rules as CanonicalizeQuery.
+//
+// Returns true on success. On failure, the output will be "something
+// reasonable" that will be consistent and valid, just probably not what
+// was intended by the web page author or caller.
+COMPONENT_EXPORT(URL)
+bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const char* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
+COMPONENT_EXPORT(URL)
+bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const gurl_base::char16* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
+
+} // namespace url
+
+#endif // URL_URL_CANON_H_
diff --git a/url/url_canon_etc.cc b/url/url_canon_etc.cc
new file mode 100644
index 0000000..23d1235
--- /dev/null
+++ b/url/url_canon_etc.cc
@@ -0,0 +1,419 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Canonicalizers for random bits that aren't big enough for their own files.
+
+#include <string.h>
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// Reports whether |ch| is one of the characters (tab, line feed, carriage
+// return) that get stripped from the middle of a URL.
+inline bool IsRemovableURLWhitespace(int ch) {
+  switch (ch) {
+    case '\t':
+    case '\n':
+    case '\r':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
+//
+// Strips every tab, CR and LF from |input| (|input_len| characters) and
+// stores the resulting length in |*output_len|. When nothing needs removing,
+// or when the input starts with "data:", |input| itself is returned and
+// |buffer| is left untouched. Otherwise the surviving characters are pushed
+// onto |buffer| and buffer->data() is returned.
+//
+// |potentially_dangling_markup| may be null. When non-null it is set to true
+// (it is never cleared here) if a '<' is copied while stripping, which can
+// only happen when the input contained removable whitespace and is not a
+// data: URL.
+//
+// It sucks that we have to do this, since this takes about 13% of the total
+// URL canonicalization time.
+template <typename CHAR>
+const CHAR* DoRemoveURLWhitespace(const CHAR* input,
+                                  int input_len,
+                                  CanonOutputT<CHAR>* buffer,
+                                  int* output_len,
+                                  bool* potentially_dangling_markup) {
+  // Fast verification that there's nothing that needs removal. This is the 99%
+  // case, so we want it to be fast and don't care about impacting the speed
+  // when we do find whitespace.
+  bool found_whitespace = false;
+  for (int i = 0; i < input_len; i++) {
+    if (IsRemovableURLWhitespace(input[i])) {
+      found_whitespace = true;
+      break;
+    }
+  }
+
+  if (!found_whitespace) {
+    // Didn't find any whitespace, we don't need to do anything. We can just
+    // return the input as the output.
+    *output_len = input_len;
+    return input;
+  }
+
+  // Skip whitespace removal for `data:` URLs.
+  //
+  // TODO(mkwst): Ideally, this would use something like
+  // `gurl_base::StartsWith`, but that turns out to be difficult to do
+  // correctly given this function's character type templating.
+  if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
+      input[3] == 'a' && input[4] == ':') {
+    *output_len = input_len;
+    return input;
+  }
+
+  // Remove the whitespace into the new buffer and return it. 0x3C is '<'.
+  for (int i = 0; i < input_len; i++) {
+    if (!IsRemovableURLWhitespace(input[i])) {
+      if (potentially_dangling_markup && input[i] == 0x3C)
+        *potentially_dangling_markup = true;
+      buffer->push_back(input[i]);
+    }
+  }
+  *output_len = buffer->length();
+  return buffer->data();
+}
+
+// Maps each 7-bit ASCII code (the array index) to its canonical scheme
+// character: letters are lower-cased, digits and '+', '-', '.' map to
+// themselves, and every other entry is 0, meaning not allowed in a scheme.
+const char kSchemeCanonical[0x80] = {
+// 00-1f: all are invalid
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 ,
+// @ A B C D E F G H I J K L M N O
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0,
+// ` a b c d e f g h i j k l m n o
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// p q r s t u v w x y z { | } ~
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 };
+
+// Reports whether |c| is an ASCII letter, the only kind of character DoScheme
+// accepts in the first position of a scheme. This could be folded into the
+// lookup table by setting the high bit for each valid character, but it's
+// only called once per URL and keeping it separate keeps the table readable.
+inline bool IsSchemeFirstChar(unsigned char c) {
+  const bool is_lower = c >= 'a' && c <= 'z';
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  return is_lower || is_upper;
+}
+
+// Writes the canonical form of the scheme |scheme| of |spec| to |output|,
+// always followed by a ':' and records the written range (excluding the
+// colon) in |*out_scheme|. Returns false when the scheme is empty or
+// unspecified, or when any character had to be passed through or escaped
+// because it is not valid in a scheme.
+template<typename CHAR, typename UCHAR>
+bool DoScheme(const CHAR* spec,
+              const Component& scheme,
+              CanonOutput* output,
+              Component* out_scheme) {
+  if (scheme.len <= 0) {
+    // Unspecified or empty scheme: the canonical form is just the colon, with
+    // a zero-length component in front of it.
+    *out_scheme = Component(output->length(), 0);
+    output->push_back(':');
+    return false;
+  }
+
+  // Everything written from here up to the colon is the output scheme.
+  out_scheme->begin = output->length();
+
+  // Danger: this loop must never drop an input character. Each one is emitted
+  // in canonical form (valid or escaped). Stripping would put the result out
+  // of sync with FindAndCompareScheme, which could make some security checks
+  // on schemes incorrect.
+  bool all_valid = true;
+  const int scheme_end = scheme.end();
+  for (int pos = scheme.begin; pos < scheme_end; pos++) {
+    const UCHAR ch = static_cast<UCHAR>(spec[pos]);
+
+    // Only ASCII can have a canonical form, and the first character is
+    // additionally required to be a letter.
+    char canonical = 0;
+    if (ch < 0x80 &&
+        (pos != scheme.begin ||
+         IsSchemeFirstChar(static_cast<unsigned char>(ch)))) {
+      canonical = kSchemeCanonical[ch];
+    }
+
+    if (canonical) {
+      output->push_back(canonical);
+      continue;
+    }
+
+    // Anything reaching this point makes the scheme invalid.
+    all_valid = false;
+    if (ch == '%') {
+      // Canonicalizing twice must give the same result. Invalid characters
+      // get escaped, so the percent sign is kept as-is to avoid escaping it
+      // again on a later pass.
+      output->push_back('%');
+    } else {
+      // Escapes the character and deals with encoding problems. May advance
+      // |pos|. The return value is ignored because we have already failed.
+      AppendUTF8EscapedChar(spec, &pos, scheme_end, output);
+    }
+  }
+
+  // The output scheme ends at the current position, before the colon.
+  out_scheme->len = output->length() - out_scheme->begin;
+  output->push_back(':');
+  return all_valid;
+}
+
+// Writes "username[:password]@" to |output| and records the written ranges.
+// |username| indexes into |username_spec| and |password| into
+// |password_spec|. The two specs are usually the same string (a single source
+// being canonicalized) but may differ when components are being replaced.
+// Always returns true.
+template<typename CHAR, typename UCHAR>
+bool DoUserInfo(const CHAR* username_spec,
+                const Component& username,
+                const CHAR* password_spec,
+                const Component& password,
+                CanonOutput* output,
+                Component* out_username,
+                Component* out_password) {
+  if (!(username.len > 0 || password.len > 0)) {
+    // Common case: nothing to write. Empty usernames/passwords are stripped.
+    *out_username = Component();
+    *out_password = Component();
+    return true;
+  }
+
+  // Username, with characters not valid in user info escaped.
+  out_username->begin = output->length();
+  if (username.len > 0) {
+    AppendStringOfType(&username_spec[username.begin], username.len,
+                       CHAR_USERINFO, output);
+  }
+  out_username->len = output->length() - out_username->begin;
+
+  // Password. A specified-but-empty password is dropped together with its
+  // ':' separator.
+  if (password.len <= 0) {
+    *out_password = Component();
+  } else {
+    output->push_back(':');
+    out_password->begin = output->length();
+    AppendStringOfType(&password_spec[password.begin], password.len,
+                       CHAR_USERINFO, output);
+    out_password->len = output->length() - out_password->begin;
+  }
+
+  output->push_back('@');
+  return true;
+}
+
+// Helper for converting a port integer to a string: writes |port| in base 10
+// into |output|, a buffer of |output_len| chars, by calling _itoa_s.
+inline void WritePortInt(char* output, int output_len, int port) {
+ _itoa_s(port, output, output_len, 10);
+}
+
+// Canonicalizes the port |port| of |spec| into |output|. When a port is
+// written it is preceded by a colon, and |*out_port| covers the text after
+// that colon. Nothing is written, and true is returned, when the port is
+// unspecified or equals |default_port_for_scheme|. An invalid port is copied
+// through and false is returned.
+template<typename CHAR, typename UCHAR>
+bool DoPort(const CHAR* spec,
+            const Component& port,
+            int default_port_for_scheme,
+            CanonOutput* output,
+            Component* out_port) {
+  const int port_num = ParsePort(spec, port);
+  if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
+    *out_port = Component();
+    return true;  // Leave port empty.
+  }
+
+  // Both remaining outcomes write the separator followed by some port text.
+  output->push_back(':');
+  out_port->begin = output->length();
+
+  const bool is_valid = (port_num != PORT_INVALID);
+  if (!is_valid) {
+    // Invalid port: copy the input text so the user can see what the error
+    // was. The false return value marks the URL as invalid.
+    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
+  } else {
+    // Convert the port number back to a string. The max port value has 5
+    // digits (plus the terminator), and Parsed::ExtractPort will have made
+    // sure the integer is in range.
+    const int kDigitsSize = 6;
+    char digits[kDigitsSize];
+    WritePortInt(digits, kDigitsSize, port_num);
+    for (const char* p = digits; p < digits + kDigitsSize && *p; ++p)
+      output->push_back(*p);
+  }
+
+  out_port->len = output->length() - out_port->begin;
+  return is_valid;
+}
+
+// clang-format off
+// Indexed by 7-bit ASCII code; true means the character is percent-escaped
+// when it appears in a ref. Percent-escape all "C0 controls" (0x00-0x1F)
+// https://infra.spec.whatwg.org/#c0-control along with the characters ' '
+// (0x20), '"' (0x22), '<' (0x3C), '>' (0x3E), and '`' (0x60).
+//
+// All 0x80 entries are spelled out, including 0x7F (DEL), which previously
+// relied on implicit zero-initialization of the missing last initializer.
+// NOTE(review): the WHATWG URL fragment percent-encode set also covers U+007F;
+// confirm whether leaving DEL unescaped here is intentional.
+const bool kShouldEscapeCharInRef[0x80] = {
+//  Control characters (0x00-0x1F)
+    true,  true,  true,  true,  true,  true,  true,  true,
+    true,  true,  true,  true,  true,  true,  true,  true,
+    true,  true,  true,  true,  true,  true,  true,  true,
+    true,  true,  true,  true,  true,  true,  true,  true,
+//  ' '    !      "      #      $      %      &      '
+    true,  false, true,  false, false, false, false, false,
+//  (      )      *      +      ,      -      .      /
+    false, false, false, false, false, false, false, false,
+//  0      1      2      3      4      5      6      7
+    false, false, false, false, false, false, false, false,
+//  8      9      :      ;      <      =      >      ?
+    false, false, false, false, true,  false, true,  false,
+//  @      A      B      C      D      E      F      G
+    false, false, false, false, false, false, false, false,
+//  H      I      J      K      L      M      N      O
+    false, false, false, false, false, false, false, false,
+//  P      Q      R      S      T      U      V      W
+    false, false, false, false, false, false, false, false,
+//  X      Y      Z      [      \      ]      ^      _
+    false, false, false, false, false, false, false, false,
+//  `      a      b      c      d      e      f      g
+    true,  false, false, false, false, false, false, false,
+//  h      i      j      k      l      m      n      o
+    false, false, false, false, false, false, false, false,
+//  p      q      r      s      t      u      v      w
+    false, false, false, false, false, false, false, false,
+//  x      y      z      {      |      }      ~      DEL (0x7F)
+    false, false, false, false, false, false, false, false
+};
+// clang-format on
+
+// Canonicalizes the ref |ref| of |spec| into |output|. When the ref is
+// present (even if empty) a '#' is written first and |*out_ref| covers what
+// follows it. An absent ref writes nothing.
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeRef(const CHAR* spec,
+                       const Component& ref,
+                       CanonOutput* output,
+                       Component* out_ref) {
+  if (ref.len < 0) {
+    // Common case of no ref.
+    *out_ref = Component();
+    return;
+  }
+
+  // The separator is needed even when the ref is empty but present.
+  output->push_back('#');
+  out_ref->begin = output->length();
+
+  // Walk the input, converting to UTF-8 and validating as we go.
+  const int ref_end = ref.end();
+  for (int pos = ref.begin; pos < ref_end; pos++) {
+    const UCHAR ch = static_cast<UCHAR>(spec[pos]);
+    if (ch == 0) {
+      // IE just strips NULLs, so we do too.
+      continue;
+    }
+
+    if (ch >= 0x80) {
+      // Non-ASCII: emitted as escaped UTF-8. May advance |pos|.
+      AppendUTF8EscapedChar(spec, &pos, ref_end, output);
+      continue;
+    }
+
+    if (kShouldEscapeCharInRef[ch])
+      AppendEscapedChar(static_cast<unsigned char>(ch), output);
+    else
+      output->push_back(static_cast<char>(ch));
+  }
+
+  out_ref->len = output->length() - out_ref->begin;
+}
+
+} // namespace
+
+// 8-bit entry point; forwards unchanged to DoRemoveURLWhitespace<char>.
+const char* RemoveURLWhitespace(const char* input,
+                                int input_len,
+                                CanonOutputT<char>* buffer,
+                                int* output_len,
+                                bool* potentially_dangling_markup) {
+  return DoRemoveURLWhitespace<char>(input, input_len, buffer, output_len,
+                                     potentially_dangling_markup);
+}
+
+// 16-bit entry point; forwards unchanged to
+// DoRemoveURLWhitespace<gurl_base::char16>.
+const gurl_base::char16* RemoveURLWhitespace(
+    const gurl_base::char16* input,
+    int input_len,
+    CanonOutputT<gurl_base::char16>* buffer,
+    int* output_len,
+    bool* potentially_dangling_markup) {
+  return DoRemoveURLWhitespace<gurl_base::char16>(
+      input, input_len, buffer, output_len, potentially_dangling_markup);
+}
+
+// Looks |ch| up in kSchemeCanonical. Yields 0 for any character that table
+// marks as disallowed and for everything outside 7-bit ASCII, since
+// non-ASCII is not supported by schemes.
+char CanonicalSchemeChar(gurl_base::char16 ch) {
+  return ch < 0x80 ? kSchemeCanonical[ch] : 0;
+}
+
+// 8-bit entry point; the result comes straight from DoScheme.
+bool CanonicalizeScheme(const char* spec,
+                        const Component& scheme,
+                        CanonOutput* output,
+                        Component* out_scheme) {
+  const bool scheme_ok =
+      DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
+  return scheme_ok;
+}
+
+// 16-bit entry point; the result comes straight from DoScheme.
+bool CanonicalizeScheme(const gurl_base::char16* spec,
+                        const Component& scheme,
+                        CanonOutput* output,
+                        Component* out_scheme) {
+  const bool scheme_ok = DoScheme<gurl_base::char16, gurl_base::char16>(
+      spec, scheme, output, out_scheme);
+  return scheme_ok;
+}
+
+// 8-bit entry point; the result comes straight from DoUserInfo.
+bool CanonicalizeUserInfo(const char* username_source,
+                          const Component& username,
+                          const char* password_source,
+                          const Component& password,
+                          CanonOutput* output,
+                          Component* out_username,
+                          Component* out_password) {
+  const bool user_info_ok = DoUserInfo<char, unsigned char>(
+      username_source, username, password_source, password, output,
+      out_username, out_password);
+  return user_info_ok;
+}
+
+// 16-bit entry point; the result comes straight from DoUserInfo.
+bool CanonicalizeUserInfo(const gurl_base::char16* username_source,
+                          const Component& username,
+                          const gurl_base::char16* password_source,
+                          const Component& password,
+                          CanonOutput* output,
+                          Component* out_username,
+                          Component* out_password) {
+  const bool user_info_ok = DoUserInfo<gurl_base::char16, gurl_base::char16>(
+      username_source, username, password_source, password, output,
+      out_username, out_password);
+  return user_info_ok;
+}
+
+// 8-bit entry point; the result comes straight from DoPort.
+bool CanonicalizePort(const char* spec,
+                      const Component& port,
+                      int default_port_for_scheme,
+                      CanonOutput* output,
+                      Component* out_port) {
+  const bool port_ok = DoPort<char, unsigned char>(
+      spec, port, default_port_for_scheme, output, out_port);
+  return port_ok;
+}
+
+// 16-bit entry point; the result comes straight from DoPort.
+bool CanonicalizePort(const gurl_base::char16* spec,
+                      const Component& port,
+                      int default_port_for_scheme,
+                      CanonOutput* output,
+                      Component* out_port) {
+  const bool port_ok = DoPort<gurl_base::char16, gurl_base::char16>(
+      spec, port, default_port_for_scheme, output, out_port);
+  return port_ok;
+}
+
+// 8-bit entry point; all the work happens in DoCanonicalizeRef.
+void CanonicalizeRef(const char* spec,
+                     const Component& ref,
+                     CanonOutput* output,
+                     Component* out_ref) {
+  DoCanonicalizeRef<char, unsigned char>(
+      spec, ref, output, out_ref);
+}
+
+// 16-bit entry point; all the work happens in DoCanonicalizeRef.
+void CanonicalizeRef(const gurl_base::char16* spec,
+                     const Component& ref,
+                     CanonOutput* output,
+                     Component* out_ref) {
+  DoCanonicalizeRef<gurl_base::char16, gurl_base::char16>(
+      spec, ref, output, out_ref);
+}
+
+} // namespace url
diff --git a/url/url_canon_filesystemurl.cc b/url/url_canon_filesystemurl.cc
new file mode 100644
index 0000000..9def892
--- /dev/null
+++ b/url/url_canon_filesystemurl.cc
@@ -0,0 +1,135 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "filesystem:file:" URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// Canonicalizes a filesystem: URL into |output| and fills in |new_parsed|.
+//
+// The outer URL's path, query and ref are read through |source| because they
+// can have replacements. The inner URL can't, so it is always read from
+// |spec| using parsed.inner_parsed().
+//
+// Returns false early, after "filesystem:" has already been appended, when
+// there is no inner URL with a valid scheme or when the inner scheme is
+// neither the file scheme nor a standard scheme. The inner parsed structure
+// is attached to |new_parsed| only when the overall result is true.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileSystemURL(const CHAR* spec,
+                                 const URLComponentSource<CHAR>& source,
+                                 const Parsed& parsed,
+                                 CharsetConverter* charset_converter,
+                                 CanonOutput* output,
+                                 Parsed* new_parsed) {
+  // filesystem only uses {scheme, path, query, ref} -- clear the rest.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+
+  const Parsed* const inner = parsed.inner_parsed();
+  Parsed canonical_inner;
+
+  // The scheme is known, so skip the more complicated scheme canonicalizer.
+  // The recorded length (10) excludes the trailing colon.
+  new_parsed->scheme.begin = output->length();
+  output->Append("filesystem:", 11);
+  new_parsed->scheme.len = 10;
+
+  if (!inner || !inner->scheme.is_valid())
+    return false;
+
+  bool success = true;
+  SchemeType inner_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+  if (CompareSchemeComponent(spec, inner->scheme, url::kFileScheme)) {
+    // Inner file: URL. Write the fixed prefix, then canonicalize its path.
+    canonical_inner.scheme.begin = output->length();
+    output->Append("file://", 7);
+    canonical_inner.scheme.len = 4;
+    success = CanonicalizePath(spec, inner->path, output,
+                               &canonical_inner.path);
+  } else if (GetStandardSchemeType(spec, inner->scheme, &inner_scheme_type)) {
+    // Strip out the user information from the inner URL, if any.
+    if (inner_scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION)
+      inner_scheme_type = SCHEME_WITH_HOST_AND_PORT;
+    success = CanonicalizeStandardURL(spec, inner->Length(), *inner,
+                                      inner_scheme_type, charset_converter,
+                                      output, &canonical_inner);
+  } else {
+    // TODO(ericu): The URL is wrong, but should we try to output more of what
+    // we were given? Echoing back filesystem:mailto etc. doesn't seem all that
+    // useful.
+    return false;
+  }
+
+  // The filesystem type must be more than just a leading slash for validity.
+  if (inner->path.len <= 1)
+    success = false;
+
+  // The outer path is always canonicalized, even after an earlier failure.
+  if (!CanonicalizePath(source.path, parsed.path, output, &new_parsed->path))
+    success = false;
+
+  // Ignore failures for query/ref since the URL can probably still be loaded.
+  CanonicalizeQuery(source.query, parsed.query, charset_converter, output,
+                    &new_parsed->query);
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  if (success)
+    new_parsed->set_inner_parsed(canonical_inner);
+  return success;
+}
+
+} // namespace
+
+// 8-bit entry point. Every component is sourced from |spec| itself.
+// |spec_len| is not used.
+bool CanonicalizeFileSystemURL(const char* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               Parsed* new_parsed) {
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      spec, source, parsed, charset_converter, output, new_parsed);
+}
+
+// 16-bit entry point. Every component is sourced from |spec| itself.
+// |spec_len| is not used.
+bool CanonicalizeFileSystemURL(const gurl_base::char16* spec,
+                               int spec_len,
+                               const Parsed& parsed,
+                               CharsetConverter* charset_converter,
+                               CanonOutput* output,
+                               Parsed* new_parsed) {
+  const URLComponentSource<gurl_base::char16> source(spec);
+  return DoCanonicalizeFileSystemURL<gurl_base::char16, gurl_base::char16>(
+      spec, source, parsed, charset_converter, output, new_parsed);
+}
+
+// Applies 8-bit |replacements| on top of |base| / |base_parsed| and
+// canonicalizes the result as a filesystem URL.
+bool ReplaceFileSystemURL(const char* base,
+                          const Parsed& base_parsed,
+                          const Replacements<char>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output,
+                          Parsed* new_parsed) {
+  // Start from the base URL, then let the replacements override the
+  // per-component sources and ranges.
+  Parsed overridden_parsed(base_parsed);
+  URLComponentSource<char> overridden_source(base);
+  SetupOverrideComponents(base, replacements, &overridden_source,
+                          &overridden_parsed);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, overridden_source, overridden_parsed, charset_converter, output,
+      new_parsed);
+}
+
+// Same as above for 16-bit |replacements|. SetupUTF16OverrideComponents is
+// handed |utf8_buffer| as scratch space, so that buffer is kept alive until
+// canonicalization has finished.
+bool ReplaceFileSystemURL(const char* base,
+                          const Parsed& base_parsed,
+                          const Replacements<gurl_base::char16>& replacements,
+                          CharsetConverter* charset_converter,
+                          CanonOutput* output,
+                          Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8_buffer;
+  Parsed overridden_parsed(base_parsed);
+  URLComponentSource<char> overridden_source(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_buffer,
+                               &overridden_source, &overridden_parsed);
+  return DoCanonicalizeFileSystemURL<char, unsigned char>(
+      base, overridden_source, overridden_parsed, charset_converter, output,
+      new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_fileurl.cc b/url/url_canon_fileurl.cc
new file mode 100644
index 0000000..ef654c7
--- /dev/null
+++ b/url/url_canon_fileurl.cc
@@ -0,0 +1,190 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "file:" URLs.
+
+#include "base/strings/string_util.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+namespace {
+
+#ifdef WIN32
+
+// Copies a canonicalized Windows drive spec (slash, upper-cased drive letter,
+// colon) from |spec| to |output| if one starts after any leading slashes in
+// [begin, end). Returns the index of the next input character to process:
+// just past the colon/pipe when a drive spec was found, or |begin| (with
+// nothing written) when there is none.
+template<typename CHAR>
+int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+                    CanonOutput* output) {
+  // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+  // (with backslashes instead of slashes as well).
+  const int drive_pos = begin + CountConsecutiveSlashes(spec, begin, end);
+  if (!DoesBeginWindowsDriveSpec(spec, drive_pos, end))
+    return begin;  // Haven't consumed any characters
+
+  // A drive spec is the start of a path, so we need to add a slash for the
+  // authority terminator (typically the third slash).
+  output->push_back('/');
+
+  // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+  // and that it is followed by a colon/pipe. Normalize the letter to
+  // uppercase.
+  const CHAR drive_letter = spec[drive_pos];
+  output->push_back(static_cast<char>(gurl_base::IsAsciiLower(drive_letter)
+                                          ? drive_letter - 'a' + 'A'
+                                          : drive_letter));
+
+  // Whatever followed the letter (colon or pipe) is normalized to a colon.
+  output->push_back(':');
+  return drive_pos + 2;
+}
+
+#endif // WIN32
+
+// Canonicalizes the path of a file URL into |output| and records the whole
+// written range in |*out_path|. On Windows a leading drive spec is
+// normalized first. The remainder goes through the regular path
+// canonicalizer, whose result is returned. A missing remainder becomes "/".
+template<typename CHAR, typename UCHAR>
+bool DoFileCanonicalizePath(const CHAR* spec,
+                            const Component& path,
+                            CanonOutput* output,
+                            Component* out_path) {
+  out_path->begin = output->length();
+
+  // Copies and normalizes the "c:" at the beginning, if present.
+#ifdef WIN32
+  const int after_drive =
+      FileDoDriveSpec(spec, path.begin, path.end(), output);
+#else
+  const int after_drive = path.begin;
+#endif
+
+  bool success = true;
+  if (after_drive >= path.end()) {
+    // No input path, canonicalize to a slash.
+    output->push_back('/');
+  } else {
+    // Hand the rest of the path, starting from the slash following the drive
+    // colon (if any, Windows only) or the first slash of the path, to the
+    // regular path canonicalizer. Its output component is a throwaway. The
+    // full range is computed below.
+    Component ignored_output_path;
+    success = CanonicalizePath(spec, MakeRange(after_drive, path.end()),
+                               output, &ignored_output_path);
+  }
+
+  out_path->len = output->length() - out_path->begin;
+  return success;
+}
+
+// Canonicalizes a file: URL into |output| and fills in |new_parsed|.
+// Components are read through |source| so individual ones may have been
+// replaced. The result reflects host and path canonicalization only. The
+// query and ref outcomes do not affect it.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+                           const Parsed& parsed,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           Parsed* new_parsed) {
+  // Things we don't set in file: URLs.
+  new_parsed->username = Component();
+  new_parsed->password = Component();
+  new_parsed->port = Component();
+
+  // The scheme is known, so skip the more complicated scheme canonicalizer.
+  // The recorded length (4) covers "file" only.
+  new_parsed->scheme.begin = output->length();
+  output->Append("file://", 7);
+  new_parsed->scheme.len = 4;
+
+  // Append the host. For many file URLs, this will be empty. For UNC, this
+  // will be present.
+  // TODO(brettw) This doesn't do any checking for host name validity. We
+  // should probably handle validity checking of UNC hosts differently than
+  // for regular IP hosts.
+  bool success =
+      CanonicalizeHost(source.host, parsed.host, output, &new_parsed->host);
+
+  // The path is canonicalized even when the host already failed.
+  if (!DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path, output,
+                                           &new_parsed->path)) {
+    success = false;
+  }
+
+  CanonicalizeQuery(source.query, parsed.query, query_converter, output,
+                    &new_parsed->query);
+
+  // Ignore failure for refs since the URL can probably still be loaded.
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+  return success;
+}
+
+} // namespace
+
+// 8-bit entry point. Every component is sourced from |spec| itself.
+// |spec_len| is not used.
+bool CanonicalizeFileURL(const char* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// 16-bit entry point. Every component is sourced from |spec| itself.
+// |spec_len| is not used.
+bool CanonicalizeFileURL(const gurl_base::char16* spec,
+                         int spec_len,
+                         const Parsed& parsed,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         Parsed* new_parsed) {
+  const URLComponentSource<gurl_base::char16> source(spec);
+  return DoCanonicalizeFileURL<gurl_base::char16, gurl_base::char16>(
+      source, parsed, query_converter, output, new_parsed);
+}
+
+// 8-bit entry point; the result comes straight from DoFileCanonicalizePath.
+bool FileCanonicalizePath(const char* spec,
+                          const Component& path,
+                          CanonOutput* output,
+                          Component* out_path) {
+  const bool path_ok =
+      DoFileCanonicalizePath<char, unsigned char>(spec, path, output, out_path);
+  return path_ok;
+}
+
+// 16-bit entry point; the result comes straight from DoFileCanonicalizePath.
+bool FileCanonicalizePath(const gurl_base::char16* spec,
+                          const Component& path,
+                          CanonOutput* output,
+                          Component* out_path) {
+  const bool path_ok =
+      DoFileCanonicalizePath<gurl_base::char16, gurl_base::char16>(
+          spec, path, output, out_path);
+  return path_ok;
+}
+
+// Applies 8-bit |replacements| on top of |base| / |base_parsed| and
+// canonicalizes the result as a file URL.
+bool ReplaceFileURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  // Start from the base URL, then let the replacements override the
+  // per-component sources and ranges.
+  Parsed overridden_parsed(base_parsed);
+  URLComponentSource<char> overridden_source(base);
+  SetupOverrideComponents(base, replacements, &overridden_source,
+                          &overridden_parsed);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      overridden_source, overridden_parsed, query_converter, output,
+      new_parsed);
+}
+
+// Same as above for 16-bit |replacements|. SetupUTF16OverrideComponents is
+// handed |utf8_buffer| as scratch space, so that buffer is kept alive until
+// canonicalization has finished.
+bool ReplaceFileURL(const char* base,
+                    const Parsed& base_parsed,
+                    const Replacements<gurl_base::char16>& replacements,
+                    CharsetConverter* query_converter,
+                    CanonOutput* output,
+                    Parsed* new_parsed) {
+  RawCanonOutput<1024> utf8_buffer;
+  Parsed overridden_parsed(base_parsed);
+  URLComponentSource<char> overridden_source(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_buffer,
+                               &overridden_source, &overridden_parsed);
+  return DoCanonicalizeFileURL<char, unsigned char>(
+      overridden_source, overridden_parsed, query_converter, output,
+      new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc
new file mode 100644
index 0000000..f83dacb
--- /dev/null
+++ b/url/url_canon_host.cc
@@ -0,0 +1,430 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "polyfills/base/logging.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// For reference, here's what IE supports:
+// Key: 0 (disallowed: failure if present in the input)
+// + (allowed either escaped or unescaped, and unmodified)
+// U (allowed escaped or unescaped but always unescaped if present in
+// escaped form)
+// E (allowed escaped or unescaped but always escaped if present in
+// unescaped form)
+// % (only allowed escaped in the input, will be unmodified).
+// I left blank alpha numeric characters.
+//
+//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+//    -----------------------------------------------
+// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
+// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
+// 4   %
+// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
+// 6   E                                               <-- That's  `
+// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
+//
+// NOTE: I didn't actually test all the control characters. Some may be
+// disallowed in the input, but they are all accepted escaped except for 0.
+// I also didn't test if characters affecting HTML parsing are allowed
+// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
+// Surprisingly, space is accepted in the input and always escaped.
+
+// Lookup table indexed by 7-bit character code (0x00-0x7f). Each entry is the
+// canonical form of that character in a host: uppercase letters map to their
+// lowercase form, 0 means the character is disallowed, and the magic kEsc
+// value means the character is allowed but must be percent-escaped. We are a
+// little more restrictive than IE, but less restrictive than Firefox.
+//
+// Note that we disallow the % character. We will allow it when part of an
+// escape sequence, of course, but this disallows "%25". Even though IE allows
+// it, allowing it would put us in a funny state. If there was an invalid
+// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
+// Allowing percents means we'll succeed a second time, so validity would change
+// based on how many times you run the canonicalizer, so reject this.
+const unsigned char kEsc = 0xff;
+const unsigned char kHostCharLookup[0x80] = {
+// 00-1f: all are invalid
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// 20-2f: ' ' ! " # $ % & ' ( ) * + , - . /
+ kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,
+// 30-3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,
+// 40-4f: @ A B C D E F G H I J K L M N O
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// 50-5f: P Q R S T U V W X Y Z [ \ ] ^ _
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
+// 60-6f: ` a b c d e f g h i j k l m n o
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// 70-7f: p q r s t u v w x y z { | } ~ (0x7f, unprintable)
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };
+
+// Longest fully-qualified domain name permitted by RFC 1034.
+constexpr int kMaxHostLength = 253;
+
+// Upper bound on the escaped host we hand to IDN processing. It is far larger
+// than kMaxHostLength because UTS#46 normalization can shrink a long string
+// until it fits the 253 character limit. Pathological input can still exceed
+// it: UTS#46 may drop any number of characters (e.g. U+00AD SOFT HYPHEN).
+// The padding is enough for normally-encountered, non-abusive hostnames.
+constexpr int kMaxHostBufferLength = 5 * kMaxHostLength;
+
+// Size argument for the RawCanonOutput temporaries used in this file.
+constexpr int kTempHostBufferLen = 1024;
+using StackBuffer = RawCanonOutputT<char, kTempHostBufferLen>;
+using StackBufferW = RawCanonOutputT<gurl_base::char16, kTempHostBufferLen>;
+
+// Walks the |host| range of |spec| once and reports what it contains:
+// |*has_non_ascii| is set when any character is >= 0x80, |*has_escaped| when
+// a percent sign is present. Both flags are always written.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec, const Component& host,
+                  bool* has_non_ascii, bool* has_escaped) {
+  bool saw_non_ascii = false;
+  bool saw_percent = false;
+  const int stop = host.end();
+  for (int pos = host.begin; pos < stop; ++pos) {
+    if (static_cast<UCHAR>(spec[pos]) >= 0x80)
+      saw_non_ascii = true;
+    else if (spec[pos] == '%')
+      saw_percent = true;
+  }
+  *has_non_ascii = saw_non_ascii;
+  *has_escaped = saw_percent;
+}
+
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
+//
+// |*has_non_ascii| reports whether non-7-bit characters reached the output.
+//
+// This function is used in two situations:
+//
+// * When the caller knows there are no non-ASCII or percent-escaped
+//   characters. This is what DoHostSubstring does. The result will be a
+//   completely canonicalized host since we know nothing weird can happen
+//   (escaped characters could be unescaped to non-7-bit, so they have to be
+//   treated with suspicion at this point). |has_non_ascii| stays false there.
+//
+// * When the caller has an 8-bit string that may need unescaping.
+//   DoComplexHost calls us in this situation to do unescaping and validation.
+//   After this, it may do other IDN operations depending on the value of the
+//   |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  *has_non_ascii = false;
+
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible. The byte is decoded into its own variable
+      // (not punned into |source|) so the result is endian-independent.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escaped character. There is nothing that can make this
+        // host valid. We append an escaped percent so the URL looks reasonable
+        // and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      source = unescaped;
+    }
+
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+  return success;
+}
+
+// Canonicalizes a host that requires IDN conversion, appending the result to
+// |output|. Returns true on success; on failure an escaped rendering of the
+// rejected text is appended instead.
+bool DoIDNHost(const gurl_base::char16* src, int src_len, CanonOutput* output) {
+  // Remembered so that we can rewind below.
+  const int rewind_to = output->length();
+
+  // We need to escape URL before doing IDN conversion, since punycode strings
+  // cannot be escaped after they are created.
+  bool non_ascii_seen;
+  RawCanonOutputW<kTempHostBufferLen> escaped;
+  DoSimpleHost(src, src_len, &escaped, &non_ascii_seen);
+
+  // Two ways to give up early: the escaped host is too long to be worth
+  // converting, or the IDN conversion itself reports an error. Either way,
+  // write some reasonable looking representation of the original string to
+  // the output. The length test comes first so that an oversized host is
+  // never passed to IDNToASCII.
+  StackBufferW idn_ascii;
+  if (escaped.length() > kMaxHostBufferLength ||
+      !IDNToASCII(escaped.data(), escaped.length(), &idn_ascii)) {
+    AppendInvalidNarrowString(src, 0, src_len, output);
+    return false;
+  }
+
+  // Now we check the ASCII output like a normal host. It will also handle
+  // unescaping. Although we unescaped everything before this function call, if
+  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
+  const bool host_ok = DoSimpleHost(idn_ascii.data(), idn_ascii.length(),
+                                    output, &non_ascii_seen);
+  if (!non_ascii_seen)
+    return host_ok;
+
+  // ICU generated something that DoSimpleHost didn't think looked like
+  // ASCII. This is quite rare, but ICU might convert some characters to
+  // percent signs which might generate new escape sequences which might in
+  // turn be invalid. An example is U+FE6A "small percent" which ICU will
+  // name prep into an ASCII percent and then we can interpret the following
+  // characters as escaped characters.
+  //
+  // In that case, throw away what DoSimpleHost wrote, append an escaped copy
+  // of the converted text and give up. DoSimpleHost will have handled a
+  // further level of escaping from ICU for simple ASCII cases (i.e. if ICU
+  // generates a new escaped ASCII sequence like "%41" we'll unescape it) but
+  // it won't do more (like handle escaped non-ASCII sequences). Handling the
+  // escaped ASCII isn't strictly necessary, but DoSimpleHost handles this
+  // case anyway so we handle it.
+  output->set_length(rewind_to);
+  AppendInvalidNarrowString(idn_ascii.data(), 0, idn_ascii.length(),
+                            output);
+  return false;
+}
+
+// 8-bit convert host to its ASCII version: the UTF-8 input is converted to
+// UTF-16 and handed to DoIDNHost. The has_escaped flag should be set if the
+// input string requires unescaping.
+bool DoComplexHost(const char* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  // Where this host starts in the output. We may write stuff and rewind it
+  // below, so we need to know where to rewind to.
+  const int rewind_to = output->length();
+
+  // The UTF-8 data to convert. Defaults to the raw input (which is known to
+  // contain non-ASCII, or the simple version would have been called instead
+  // of us) and is redirected to the unescaped copy in |*output| when needed.
+  const char* utf8_source = host;
+  int utf8_source_len = host_len;
+
+  if (has_escaped) {
+    // Unescape before converting to UTF-16 for IDN. We write this into the
+    // output because it most likely does not require IDNization, and we can
+    // save another huge stack buffer. It will be replaced below if it requires
+    // IDN. This also refreshes |has_non_ascii| so we know whether the
+    // unescaped input requires IDN.
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+      // Error with some escape sequence. We'll call the current output
+      // complete. DoSimpleHost will have written some "reasonable" output.
+      return false;
+    }
+
+    // Unescaping may have left us with ASCII input, in which case the
+    // unescaped version we wrote to output is complete.
+    if (!has_non_ascii)
+      return true;
+
+    // Point at the data that was just written (it may be appended to other
+    // data in the output buffer).
+    utf8_source = &output->data()[rewind_to];
+    utf8_source_len = output->length() - rewind_to;
+  }
+
+  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
+  StackBufferW utf16;
+  const bool converted =
+      ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16);
+  if (!converted) {
+    // In this error case, the input may or may not be the output, so copy it
+    // aside before rewinding and re-appending it in escaped form.
+    StackBuffer utf8;
+    for (int i = 0; i < utf8_source_len; i++)
+      utf8.push_back(utf8_source[i]);
+    output->set_length(rewind_to);
+    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+    return false;
+  }
+
+  // Above, we may have used the output to hold the unescaped values, so
+  // rewind it to where we started now that the UTF-16 copy exists.
+  output->set_length(rewind_to);
+
+  // DoIDNHost escapes the text, runs the IDN conversion, and then passes the
+  // converted result through DoSimpleHost, which performs the normal ASCII
+  // canonicalization.
+  return DoIDNHost(utf16.data(), utf16.length(), output);
+}
+
+// UTF-16 convert host to its ASCII version. The set up is already ready for
+// the backend, so without escapes we just pass through. The has_escaped flag
+// should be set if the input string requires unescaping.
+bool DoComplexHost(const gurl_base::char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (!has_escaped) {
+    // No unescaping necessary, we can safely pass the input to ICU. This
+    // function will only get called if we either have escaped or non-ascii
+    // input, so it's safe to just use ICU now. Even if the input is ASCII,
+    // this function will do the right thing (just slower than we could).
+    return DoIDNHost(host, host_len, output);
+  }
+
+  // Yikes, we have escaped characters with wide input. The escaped
+  // characters should be interpreted as UTF-8. To solve this problem,
+  // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
+  //
+  // We don't bother to optimize the conversion in the ASCII case (which
+  // *could* just be a copy) and use the UTF-8 path, because it should be
+  // very rare that host names have escaped characters, and it is relatively
+  // fast to do the conversion anyway.
+  StackBuffer utf8;
+  if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
+    AppendInvalidNarrowString(host, 0, host_len, output);
+    return false;
+  }
+
+  // Once we convert to UTF-8, we can use the 8-bit version of the complex
+  // host handling code above.
+  return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
+                       has_escaped, output);
+}
+
+// Canonicalizes the |host| portion of |spec| into |output|, choosing between
+// the simple ASCII path and the complex (escaped / non-ASCII) path.
+template <typename CHAR, typename UCHAR>
+bool DoHostSubstring(const CHAR* spec, const Component& host,
+                     CanonOutput* output) {
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
+
+  const CHAR* const first = &spec[host.begin];
+  if (!has_non_ascii && !has_escaped) {
+    const bool success =
+        DoSimpleHost(first, host.len, output, &has_non_ascii);
+    GURL_DCHECK(!has_non_ascii);
+    return success;
+  }
+  return DoComplexHost(first, host.len, has_non_ascii, has_escaped, output);
+}
+
+// Canonicalizes |host| into |output| and fills |host_info|: NEUTRAL for an
+// empty host, BROKEN on failure, else as classified by CanonicalizeIPAddress.
+template <typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec, const Component& host,
+            CanonOutput* output, CanonHostInfo* host_info) {
+  if (host.len <= 0) {
+    // Empty hosts don't need anything.
+    host_info->out_host = Component();
+    host_info->family = CanonHostInfo::NEUTRAL;
+    return;
+  }
+
+  // Keep track of output's initial length, so we can rewind later.
+  const int host_start = output->length();
+
+  const bool host_ok = DoHostSubstring<CHAR, UCHAR>(spec, host, output);
+  if (!host_ok) {
+    // Canonicalization failed. Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  } else {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address. IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    const Component written = MakeRange(host_start, output->length());
+    CanonicalizeIPAddress(output->data(), written, &canon_ip, host_info);
+
+    // If we got an IPv4/IPv6 address, copy the canonical form back to the
+    // real buffer. Otherwise, it's a hostname or broken IP, in which case
+    // we just leave it in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(host_start);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  }
+
+  host_info->out_host = MakeRange(host_start, output->length());
+}
+
+} // namespace
+
+// 8-bit entry point. Reports the canonical host range through |out_host| and
+// returns false only when DoHost flagged the host as BROKEN.
+bool CanonicalizeHost(const char* spec, const Component& host,
+                      CanonOutput* output, Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char, unsigned char>(spec, host, output, &info);
+  *out_host = info.out_host;
+  return !(info.family == CanonHostInfo::BROKEN);
+}
+
+// UTF-16 entry point. Reports the canonical host range through |out_host| and
+// returns false only when DoHost flagged the host as BROKEN.
+bool CanonicalizeHost(const gurl_base::char16* spec, const Component& host,
+                      CanonOutput* output, Component* out_host) {
+  CanonHostInfo info;
+  DoHost<gurl_base::char16, gurl_base::char16>(spec, host, output, &info);
+  *out_host = info.out_host;
+  return !(info.family == CanonHostInfo::BROKEN);
+}
+
+// 8-bit entry point that hands the caller the full CanonHostInfo produced by
+// DoHost instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const char* spec, const Component& host,
+                             CanonOutput* output, CanonHostInfo* host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+// UTF-16 entry point that hands the caller the full CanonHostInfo produced by
+// DoHost instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const gurl_base::char16* spec,
+                             const Component& host, CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<gurl_base::char16, gurl_base::char16>(spec, host, output, host_info); }
+
+// 8-bit entry point for DoHostSubstring, which performs no IP address check.
+bool CanonicalizeHostSubstring(const char* spec, const Component& host,
+                               CanonOutput* output) {
+  return DoHostSubstring<char, unsigned char>(spec, host, output);
+}
+
+// UTF-16 entry point for DoHostSubstring, which performs no IP address check.
+bool CanonicalizeHostSubstring(const gurl_base::char16* spec,
+                               const Component& host, CanonOutput* output) {
+  return DoHostSubstring<gurl_base::char16, gurl_base::char16>(spec, host,
+                                                               output); }
+
+} // namespace url
diff --git a/url/url_canon_icu.cc b/url/url_canon_icu.cc
new file mode 100644
index 0000000..a9a32fd
--- /dev/null
+++ b/url/url_canon_icu.cc
@@ -0,0 +1,110 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ICU-based character set converter.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "polyfills/base/logging.h"
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/utypes.h>
+#include "url/url_canon_icu.h"
+#include "url/url_canon_internal.h" // for _itoa_s
+
+namespace url {
+
+namespace {
+
+// Called when converting a character that can not be represented, this will
+// append an escaped version of the numerical character reference for that code
+// point. It is of the form "Ӓ" and we will escape the non-digits to
+// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
+void appendURLEscapedChar(const void* context,
+ UConverterFromUnicodeArgs* from_args,
+ const UChar* code_units,
+ int32_t length,
+ UChar32 code_point,
+ UConverterCallbackReason reason,
+ UErrorCode* err) {
+ if (reason == UCNV_UNASSIGNED) {
+ *err = U_ZERO_ERROR;
+
+ const static int prefix_len = 6;
+ const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped
+ ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
+
+ GURL_DCHECK(code_point < 0x110000);
+ char number[8]; // Max Unicode code point is 7 digits.
+ _itoa_s(code_point, number, 10);
+ int number_len = static_cast<int>(strlen(number));
+ ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
+
+ const static int postfix_len = 3;
+ const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped
+ ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
+ }
+}
+
+// Scoped guard: installs appendURLEscapedChar as |converter|'s from-Unicode
+// callback and restores the previous one on destruction. |converter| must
+// outlive the guard. Non-copyable so the restore cannot run twice.
+class AppendHandlerInstaller {
+ public:
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+  AppendHandlerInstaller(const AppendHandlerInstaller&) = delete;
+  AppendHandlerInstaller& operator=(const AppendHandlerInstaller&) = delete;
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+ private:
+  UConverter* converter_;
+  UConverterFromUCallback old_callback_;
+  const void* old_context_;
+};
+
+} // namespace
+
+// Stores the borrowed ICU converter; it is neither opened nor closed here.
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+    : converter_(converter) {}
+
+ICUCharsetConverter::~ICUCharsetConverter() = default;
+
+// Converts |input| to the converter's charset and appends it to |output|,
+// enlarging |output| and retrying whenever ICU reports a buffer overflow.
+void ICUCharsetConverter::ConvertFromUTF16(const gurl_base::char16* input,
+                                           int input_len, CanonOutput* output) {
+  // Escapes characters the destination charset cannot represent.
+  AppendHandlerInstaller handler(converter_);
+
+  const int begin_offset = output->length();
+  int dest_capacity = output->capacity() - begin_offset;
+  output->set_length(output->length());
+
+  while (true) {
+    UErrorCode err = U_ZERO_ERROR;
+    char* dest = &output->data()[begin_offset];
+    const int required_capacity = ucnv_fromUChars(
+        converter_, dest, dest_capacity, input, input_len, &err);
+    if (err == U_BUFFER_OVERFLOW_ERROR) {
+      // Output didn't fit, expand to the size ICU asked for and try again.
+      dest_capacity = required_capacity;
+      output->Resize(begin_offset + dest_capacity);
+      continue;
+    }
+    output->set_length(begin_offset + required_capacity);
+    return;
+  }
+}
+
+} // namespace url
diff --git a/url/url_canon_icu.h b/url/url_canon_icu.h
new file mode 100644
index 0000000..33fc863
--- /dev/null
+++ b/url/url_canon_icu.h
@@ -0,0 +1,40 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_ICU_H_
+#define URL_URL_CANON_ICU_H_
+
+// ICU integration functions.
+
+#include "base/compiler_specific.h"
+#include "polyfills/base/component_export.h"
+#include "url/url_canon.h"
+
+typedef struct UConverter UConverter;
+
+namespace url {
+
+// A CharsetConverter implementation that embedders can use to interface the
+// canonicalizer with ICU's conversion routines.
+class COMPONENT_EXPORT(URL) ICUCharsetConverter : public CharsetConverter {
+ public:
+ // Constructs a converter using an already-existing ICU character set
+ // converter. |converter| is NOT owned by this object; the creator must keep
+ // it alive for at least as long as this object.
+ ICUCharsetConverter(UConverter* converter);
+
+ ~ICUCharsetConverter() override;
+ // Converts the UTF-16 |input| via the ICU converter, appending to |output|.
+ void ConvertFromUTF16(const gurl_base::char16* input,
+ int input_len,
+ CanonOutput* output) override;
+
+ private:
+ // The ICU converter, not owned by this class.
+ UConverter* converter_;
+};
+
+} // namespace url
+
+#endif // URL_URL_CANON_ICU_H_
diff --git a/url/url_canon_icu_unittest.cc b/url/url_canon_icu_unittest.cc
new file mode 100644
index 0000000..55fd58f
--- /dev/null
+++ b/url/url_canon_icu_unittest.cc
@@ -0,0 +1,162 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include "base/stl_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include <unicode/ucnv.h>
+#include "url/url_canon.h"
+#include "url/url_canon_icu.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+namespace {
+
+// Test helper that opens an ICU converter and closes it on destruction.
+class UConvScoper {
+ public:
+  // A failed open leaves converter() NULL; the ICU status itself is dropped.
+  explicit UConvScoper(const char* charset_name) {
+    UErrorCode status = U_ZERO_ERROR;
+    converter_ = ucnv_open(charset_name, &status);
+  }
+
+  ~UConvScoper() {
+    if (converter_ != NULL) ucnv_close(converter_);
+  }
+
+  // Returns the converter object, may be NULL.
+  UConverter* converter() const { return converter_; }
+
+ private:
+  UConverter* converter_;
+};
+
+TEST(URLCanonIcuTest, ICUCharsetConverter) {
+  // Each case converts |input| into |encoding| and compares with |expected|.
+  struct ICUCase {
+    const wchar_t* input;
+    const char* encoding;
+    const char* expected;
+  } icu_cases[] = {
+      // UTF-8.
+      {L"Hello, world", "utf-8", "Hello, world"},
+      {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+      // Non-BMP UTF-8.
+      {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
+      // Big5
+      {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
+      // Unrepresentable character in the destination set.
+      {L"hello\x4f60\x06de\x597dworld", "big5",
+       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
+  };
+
+  for (const ICUCase& test_case : icu_cases) {
+    UConvScoper conv(test_case.encoding);
+    ASSERT_TRUE(conv.converter() != NULL);
+    ICUCharsetConverter converter(conv.converter());
+
+    const gurl_base::string16 utf16(
+        test_utils::TruncateWStringToUTF16(test_case.input));
+    std::string str;
+    StdStringCanonOutput output(&str);
+    converter.ConvertFromUTF16(utf16.c_str(),
+                               static_cast<int>(utf16.length()), &output);
+    output.Complete();
+
+    EXPECT_STREQ(test_case.expected, str.c_str());
+  }
+
+  // Test string sizes around the resize boundary for the output to make sure
+  // the converter resizes as needed.
+  const int static_size = 16;
+  UConvScoper conv("utf-8");
+  ASSERT_TRUE(conv.converter());
+  ICUCharsetConverter converter(conv.converter());
+  for (int len = static_size - 2; len <= static_size + 2; ++len) {
+    // Make a string with the appropriate length.
+    gurl_base::string16 input;
+    for (int n = 0; n < len; ++n)
+      input.push_back('a');
+
+    RawCanonOutput<static_size> output;
+    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
+                               &output);
+    EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
+  }
+}
+
+TEST(URLCanonIcuTest, QueryWithConverter) {
+  struct QueryCase {
+    const char* input8;
+    const wchar_t* input16;
+    const char* encoding;
+    const char* expected;
+  } query_cases[] = {
+      // Regular ASCII case in some different encodings.
+      {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
+      {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
+      {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
+      // Chinese input/output
+      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
+       "?q=%C4%E3%BA%C3"},
+      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
+      // Unencodable character in the destination character set should be
+      // escaped. The escape sequence unescapes to be the entity name:
+      // "?q=Chinese&#65319;"
+      {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
+       "?q=Chinese%26%2365319%3B"},
+  };
+
+  for (const QueryCase& test_case : query_cases) {
+    Component out_comp;
+
+    UConvScoper conv(test_case.encoding);
+    ASSERT_TRUE(!test_case.encoding || conv.converter());
+    ICUCharsetConverter converter(conv.converter());
+
+    if (test_case.input8) {
+      const int len = static_cast<int>(strlen(test_case.input8));
+      const Component in_comp(0, len);
+      std::string out_str;
+
+      StdStringCanonOutput output(&out_str);
+      CanonicalizeQuery(test_case.input8, in_comp, &converter, &output,
+                        &out_comp);
+      output.Complete();
+
+      EXPECT_EQ(test_case.expected, out_str);
+    }
+
+    if (test_case.input16) {
+      const gurl_base::string16 input16(
+          test_utils::TruncateWStringToUTF16(test_case.input16));
+      const int len = static_cast<int>(input16.length());
+      const Component in_comp(0, len);
+      std::string out_str;
+
+      StdStringCanonOutput output(&out_str);
+      CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
+                        &out_comp);
+      output.Complete();
+
+      EXPECT_EQ(test_case.expected, out_str);
+    }
+  }
+
+  // Extra test for input with embedded NULL;
+  std::string out_str;
+  StdStringCanonOutput output(&out_str);
+  Component out_comp;
+  CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
+  output.Complete();
+  EXPECT_EQ("?a%20%00z%01", out_str);
+}
+
+} // namespace
+
+} // namespace url
diff --git a/url/url_canon_internal.cc b/url/url_canon_internal.cc
new file mode 100644
index 0000000..961c3b0
--- /dev/null
+++ b/url/url_canon_internal.cc
@@ -0,0 +1,433 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon_internal.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include <cstdio>
+#include <string>
+
+#include "base/strings/utf_string_conversion_utils.h"
+
+namespace url {
+
+namespace {
+
+template<typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source, int length,
+ SharedCharTypes type,
+ CanonOutput* output) {
+ for (int i = 0; i < length; i++) {
+ if (static_cast<UCHAR>(source[i]) >= 0x80) {
+ // ReadUTFChar will fill the code point with kUnicodeReplacementCharacter
+ // when the input is invalid, which is what we want.
+ unsigned code_point;
+ ReadUTFChar(source, &i, length, &code_point);
+ AppendUTF8EscapedValue(code_point, output);
+ } else {
+ // 7-bit character: append it as-is, or escaped if it is not of |type|.
+ unsigned char uch = static_cast<unsigned char>(source[i]);
+ if (!IsCharOfType(uch, type))
+ AppendEscapedChar(uch, output);
+ else
+ output->push_back(uch);
+ }
+ }
+}
+
+// Appends |spec[begin, end)| to |output|, escaping some characters. Assumes
+// the input values all fit in 8 bits, although it allows any CHAR type.
+template<typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ for (int i = begin; i < end; i++) {
+ UCHAR uch = static_cast<UCHAR>(spec[i]);
+ if (uch >= 0x80) {
+ // Handle UTF-8/16 encodings. On invalid input this call appends the
+ // escaped replacement character instead.
+ AppendUTF8EscapedChar(spec, &i, end, output);
+ } else if (uch <= ' ' || uch == 0x7f) {
+ // This function is for error handling, so we escape all control
+ // characters and spaces, but not anything else since we lack
+ // context to do something more specific.
+ AppendEscapedChar(static_cast<unsigned char>(uch), output);
+ } else {
+ output->push_back(static_cast<char>(uch));
+ }
+ }
+}
+
+// Overrides one component, see the Replacements structure for
+// what the various combinations of source pointer and component mean.
+void DoOverrideComponent(const char* override_source,
+ const Component& override_component,
+ const char** dest,
+ Component* dest_component) {
+ if (override_source) {
+ *dest = override_source;
+ *dest_component = override_component;
+ }
+}
+
+// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
+// not actually set the output character pointer.
+//
+// The input is converted to UTF-8 at the end of the given buffer as a temporary
+// holding place. The component identifying the portion of the buffer used in
+// the |utf8_buffer| will be specified in |*dest_component|.
+//
+// This will not actually set any |dest| pointer like DoOverrideComponent
+// does because all of the pointers will point into the |utf8_buffer|, which
+// may get resized while we're overriding a subsequent component. Instead, the
+// caller should use the beginning of the |utf8_buffer| as the string pointer
+// for all components once all overrides have been prepared.
+bool PrepareUTF16OverrideComponent(const gurl_base::char16* override_source,
+ const Component& override_component,
+ CanonOutput* utf8_buffer,
+ Component* dest_component) {
+ bool success = true;
+ if (override_source) {
+ if (!override_component.is_valid()) {
+ // Non-"valid" component (means delete), so we need to preserve that.
+ *dest_component = Component();
+ } else {
+ // Convert to UTF-8.
+ dest_component->begin = utf8_buffer->length();
+ success = ConvertUTF16ToUTF8(&override_source[override_component.begin],
+ override_component.len, utf8_buffer);
+ dest_component->len = utf8_buffer->length() - dest_component->begin;
+ }
+ }
+ return success;
+}
+
+} // namespace
+
+// See the header file for this array's declaration.
+const unsigned char kSharedCharTypeTable[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f
+ 0, // 0x20 ' ' (escape spaces in queries)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 !
+ 0, // 0x22 "
+ 0, // 0x23 # (invalid in query since it marks the ref)
+ CHAR_QUERY | CHAR_USERINFO, // 0x24 $
+ CHAR_QUERY | CHAR_USERINFO, // 0x25 %
+ CHAR_QUERY | CHAR_USERINFO, // 0x26 &
+ 0, // 0x27 ' (Try to prevent XSS.)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 (
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 )
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a *
+ CHAR_QUERY | CHAR_USERINFO, // 0x2b +
+ CHAR_QUERY | CHAR_USERINFO, // 0x2c ,
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d -
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e .
+ CHAR_QUERY, // 0x2f /
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x30 0
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x31 1
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x32 2
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x33 3
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x34 4
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x35 5
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x36 6
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x37 7
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x38 8
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x39 9
+ CHAR_QUERY, // 0x3a :
+ CHAR_QUERY, // 0x3b ;
+ 0, // 0x3c < (Try to prevent certain types of XSS.)
+ CHAR_QUERY, // 0x3d =
+ 0, // 0x3e > (Try to prevent certain types of XSS.)
+ CHAR_QUERY, // 0x3f ?
+ CHAR_QUERY, // 0x40 @
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z
+ CHAR_QUERY, // 0x5b [
+ CHAR_QUERY, // 0x5c '\'
+ CHAR_QUERY, // 0x5d ]
+ CHAR_QUERY, // 0x5e ^
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _
+ CHAR_QUERY, // 0x60 `
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z
+ CHAR_QUERY, // 0x7b {
+ CHAR_QUERY, // 0x7c |
+ CHAR_QUERY, // 0x7d }
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~
+ 0, // 0x7f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
+};
+
+const char kHexCharLookup[0x10] = {
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+};
+
+const char kCharToHexLookup[8] = {
+ 0, // 0x00 - 0x1f
+ '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
+ 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
+ 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
+ 0, // 0x80 - 0x9F
+ 0, // 0xA0 - 0xBF
+ 0, // 0xC0 - 0xDF
+ 0, // 0xE0 - 0xFF
+};
+
+const gurl_base::char16 kUnicodeReplacementCharacter = 0xfffd;
+
+void AppendStringOfType(const char* source, int length,
+ SharedCharTypes type,
+ CanonOutput* output) {
+ DoAppendStringOfType<char, unsigned char>(source, length, type, output);
+}
+
+void AppendStringOfType(const gurl_base::char16* source, int length,
+ SharedCharTypes type,
+ CanonOutput* output) {
+ DoAppendStringOfType<gurl_base::char16, gurl_base::char16>(
+ source, length, type, output);
+}
+
+bool ReadUTFChar(const char* str, int* begin, int length,
+ unsigned* code_point_out) {
+ // This depends on ints and int32s being the same thing. If they're not, it
+ // will fail to compile.
+ // TODO(mmenke): This should probably be fixed.
+ if (!gurl_base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+ !gurl_base::IsValidCharacter(*code_point_out)) {
+ *code_point_out = kUnicodeReplacementCharacter;
+ return false;
+ }
+ return true;
+}
+
+bool ReadUTFChar(const gurl_base::char16* str, int* begin, int length,
+ unsigned* code_point_out) {
+ // This depends on ints and int32s being the same thing. If they're not, it
+ // will fail to compile.
+ // TODO(mmenke): This should probably be fixed.
+ if (!gurl_base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
+ !gurl_base::IsValidCharacter(*code_point_out)) {
+ *code_point_out = kUnicodeReplacementCharacter;
+ return false;
+ }
+ return true;
+}
+
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+ CanonOutput* output) {
+ DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
+}
+
+void AppendInvalidNarrowString(const gurl_base::char16* spec, int begin, int end,
+ CanonOutput* output) {
+ DoAppendInvalidNarrowString<gurl_base::char16, gurl_base::char16>(
+ spec, begin, end, output);
+}
+
+bool ConvertUTF16ToUTF8(const gurl_base::char16* input, int input_len,
+ CanonOutput* output) {
+ bool success = true;
+ for (int i = 0; i < input_len; i++) {
+ unsigned code_point;
+ success &= ReadUTFChar(input, &i, input_len, &code_point);
+ AppendUTF8Value(code_point, output);
+ }
+ return success;
+}
+
+bool ConvertUTF8ToUTF16(const char* input, int input_len,
+ CanonOutputT<gurl_base::char16>* output) {
+ bool success = true;
+ for (int i = 0; i < input_len; i++) {
+ unsigned code_point;
+ success &= ReadUTFChar(input, &i, input_len, &code_point);
+ AppendUTF16Value(code_point, output);
+ }
+ return success;
+}
+
+void SetupOverrideComponents(const char* base,
+ const Replacements<char>& repl,
+ URLComponentSource<char>* source,
+ Parsed* parsed) {
+ // Get the source and parsed structures of the things we are replacing.
+ const URLComponentSource<char>& repl_source = repl.sources();
+ const Parsed& repl_parsed = repl.components();
+
+ DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
+ &source->scheme, &parsed->scheme);
+ DoOverrideComponent(repl_source.username, repl_parsed.username,
+ &source->username, &parsed->username);
+ DoOverrideComponent(repl_source.password, repl_parsed.password,
+ &source->password, &parsed->password);
+
+ // Our host should be empty if not present, so override the default setup.
+ DoOverrideComponent(repl_source.host, repl_parsed.host,
+ &source->host, &parsed->host);
+ if (parsed->host.len == -1)
+ parsed->host.len = 0;
+
+ DoOverrideComponent(repl_source.port, repl_parsed.port,
+ &source->port, &parsed->port);
+ DoOverrideComponent(repl_source.path, repl_parsed.path,
+ &source->path, &parsed->path);
+ DoOverrideComponent(repl_source.query, repl_parsed.query,
+ &source->query, &parsed->query);
+ DoOverrideComponent(repl_source.ref, repl_parsed.ref,
+ &source->ref, &parsed->ref);
+}
+
+bool SetupUTF16OverrideComponents(const char* base,
+ const Replacements<gurl_base::char16>& repl,
+ CanonOutput* utf8_buffer,
+ URLComponentSource<char>* source,
+ Parsed* parsed) {
+ bool success = true;
+
+ // Get the source and parsed structures of the things we are replacing.
+ const URLComponentSource<gurl_base::char16>& repl_source = repl.sources();
+ const Parsed& repl_parsed = repl.components();
+
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.scheme, repl_parsed.scheme,
+ utf8_buffer, &parsed->scheme);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.username, repl_parsed.username,
+ utf8_buffer, &parsed->username);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.password, repl_parsed.password,
+ utf8_buffer, &parsed->password);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.host, repl_parsed.host,
+ utf8_buffer, &parsed->host);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.port, repl_parsed.port,
+ utf8_buffer, &parsed->port);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.path, repl_parsed.path,
+ utf8_buffer, &parsed->path);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.query, repl_parsed.query,
+ utf8_buffer, &parsed->query);
+ success &= PrepareUTF16OverrideComponent(
+ repl_source.ref, repl_parsed.ref,
+ utf8_buffer, &parsed->ref);
+
+ // PrepareUTF16OverrideComponent will not have set the data pointer since the
+ // buffer could be resized, invalidating the pointers. We set the data
+ // pointers for affected components now that the buffer is finalized.
+ if (repl_source.scheme) source->scheme = utf8_buffer->data();
+ if (repl_source.username) source->username = utf8_buffer->data();
+ if (repl_source.password) source->password = utf8_buffer->data();
+ if (repl_source.host) source->host = utf8_buffer->data();
+ if (repl_source.port) source->port = utf8_buffer->data();
+ if (repl_source.path) source->path = utf8_buffer->data();
+ if (repl_source.query) source->query = utf8_buffer->data();
+ if (repl_source.ref) source->ref = utf8_buffer->data();
+
+ return success;
+}
+
+#ifndef WIN32
+
+int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
+ const char* format_str;
+ if (radix == 10)
+ format_str = "%d";
+ else if (radix == 16)
+ format_str = "%x";
+ else
+ return EINVAL;
+
+ int written = snprintf(buffer, size_in_chars, format_str, value);
+ if (static_cast<size_t>(written) >= size_in_chars) {
+ // Output was truncated, or written was negative.
+ return EINVAL;
+ }
+ return 0;
+}
+
+int _itow_s(int value, gurl_base::char16* buffer, size_t size_in_chars, int radix) {
+ if (radix != 10)
+ return EINVAL;
+
+ // No more than 12 characters will be required for a 32-bit integer.
+ // Add an extra byte for the terminating null.
+ char temp[13];
+ int written = snprintf(temp, sizeof(temp), "%d", value);
+ if (static_cast<size_t>(written) >= size_in_chars) {
+ // Output was truncated, or written was negative.
+ return EINVAL;
+ }
+
+ for (int i = 0; i < written; ++i) {
+ buffer[i] = static_cast<gurl_base::char16>(temp[i]);
+ }
+ buffer[written] = '\0';
+ return 0;
+}
+
+#endif // !WIN32
+
+} // namespace url
diff --git a/url/url_canon_internal.h b/url/url_canon_internal.h
new file mode 100644
index 0000000..e0c7567
--- /dev/null
+++ b/url/url_canon_internal.h
@@ -0,0 +1,445 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_INTERNAL_H_
+#define URL_URL_CANON_INTERNAL_H_
+
+// This file is intended to be included in another C++ file where the character
+// types are defined. This allows us to write mostly generic code, but not have
+// template bloat because everything is inlined when anybody calls any of our
+// functions.
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "polyfills/base/component_export.h"
+#include "polyfills/base/logging.h"
+#include "url/url_canon.h"
+
+namespace url {
+
+// Character type handling -----------------------------------------------------
+
+// Bits that identify different character types. These types identify different
+// bits that are set for each 8-bit character in the kSharedCharTypeTable.
+enum SharedCharTypes {
+ // Characters that do not require escaping in queries. Characters that do
+ // not have this flag will be escaped; see url_canon_query.cc
+ CHAR_QUERY = 1,
+
+ // Valid in the username/password field.
+ CHAR_USERINFO = 2,
+
+ // Valid in an IPv4 address (digits plus dot and 'x' for hex).
+ CHAR_IPV4 = 4,
+
+ // Valid in an ASCII-representation of a hex digit (as in %-escaped).
+ CHAR_HEX = 8,
+
+ // Valid in an ASCII-representation of a decimal digit.
+ CHAR_DEC = 16,
+
+ // Valid in an ASCII-representation of an octal digit.
+ CHAR_OCT = 32,
+
+ // Characters that do not require escaping in encodeURIComponent. Characters
+ // that do not have this flag will be escaped; see url_util.cc.
+ CHAR_COMPONENT = 64,
+};
+
+// This table contains the flags in SharedCharTypes for each 8-bit character.
+// Some canonicalization functions have their own specialized lookup table.
+// For those with simple requirements, we have collected the flags in one
+// place so there are fewer lookup tables to load into the CPU cache.
+//
+// Using an unsigned char type has a small but measurable performance benefit
+// over using a 32-bit number.
+extern const unsigned char kSharedCharTypeTable[0x100];
+
+// More readable wrappers around the character type lookup table.
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) {
+ return !!(kSharedCharTypeTable[c] & type);
+}
+inline bool IsQueryChar(unsigned char c) {
+ return IsCharOfType(c, CHAR_QUERY);
+}
+inline bool IsIPv4Char(unsigned char c) {
+ return IsCharOfType(c, CHAR_IPV4);
+}
+inline bool IsHexChar(unsigned char c) {
+ return IsCharOfType(c, CHAR_HEX);
+}
+inline bool IsComponentChar(unsigned char c) {
+ return IsCharOfType(c, CHAR_COMPONENT);
+}
+
+// Appends the given string to the output, escaping characters that do not
+// match the given |type| in SharedCharTypes.
+void AppendStringOfType(const char* source, int length,
+ SharedCharTypes type,
+ CanonOutput* output);
+void AppendStringOfType(const gurl_base::char16* source, int length,
+ SharedCharTypes type,
+ CanonOutput* output);
+
+// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit
+// that will be used to represent it.
+COMPONENT_EXPORT(URL) extern const char kHexCharLookup[0x10];
+
+// This lookup table allows fast conversion between ASCII hex letters and their
+// corresponding numerical value. The 8-bit range is divided up into 8
+// regions of 0x20 characters each. Each of the three character types (numbers,
+// uppercase, lowercase) falls into different regions of this range. The table
+// contains the amount to subtract from characters in that range to get at
+// the corresponding numerical value.
+//
+// See HexCharToValue for the lookup.
+extern const char kCharToHexLookup[8];
+
+// Assumes the input is a valid hex digit! Call IsHexChar before using this.
+inline unsigned char HexCharToValue(unsigned char c) {
+ return c - kCharToHexLookup[c / 0x20];
+}
+
+// Indicates if the given character is a dot or dot equivalent, returning the
+// number of characters taken by it. This will be one for a literal dot, 3 for
+// an escaped dot. If the character is not a dot, this will return 0.
+template<typename CHAR>
+inline int IsDot(const CHAR* spec, int offset, int end) {
+ if (spec[offset] == '.') {
+ return 1;
+ } else if (spec[offset] == '%' && offset + 3 <= end &&
+ spec[offset + 1] == '2' &&
+ (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) {
+ // Found "%2e"
+ return 3;
+ }
+ return 0;
+}
+
+// Returns the canonicalized version of the input character according to scheme
+// rules. This is implemented alongside the scheme canonicalizer, and is
+// required for relative URL resolving to test for scheme equality.
+//
+// Returns 0 if the input character is not a valid scheme character.
+char CanonicalSchemeChar(gurl_base::char16 ch);
+
+// Write a single character, escaped, to the output. This always escapes: it
+// does no checking that the character requires escaping.
+// Escaping only makes sense for 8-bit chars, so the code works for all
+// combinations of input parameters (8/16-bit).
+template<typename UINCHAR, typename OUTCHAR>
+inline void AppendEscapedChar(UINCHAR ch,
+ CanonOutputT<OUTCHAR>* output) {
+ output->push_back('%');
+ output->push_back(kHexCharLookup[(ch >> 4) & 0xf]);
+ output->push_back(kHexCharLookup[ch & 0xf]);
+}
+
+// The character we'll substitute for undecodable or invalid characters.
+extern const gurl_base::char16 kUnicodeReplacementCharacter;
+
+// UTF-8 functions ------------------------------------------------------------
+
+// Reads one character in UTF-8 starting at |*begin| in |str| and places
+// the decoded value into |*code_point|. If the character is valid, we will
+// return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point|.
+//
+// |*begin| will be updated to point to the last character consumed so it
+// can be incremented in a loop and will be ready for the next character.
+// (for a single-byte ASCII character, it will not be changed).
+COMPONENT_EXPORT(URL)
+bool ReadUTFChar(const char* str,
+ int* begin,
+ int length,
+ unsigned* code_point_out);
+
+// Generic To-UTF-8 converter. This will call the given append method for each
+// character that should be appended, with the given output method. Wrappers
+// are provided below for escaped and non-escaped versions of this.
+//
+// The char_value must have already been checked that it's a valid Unicode
+// character.
+template<class Output, void Appender(unsigned char, Output*)>
+inline void DoAppendUTF8(unsigned char_value, Output* output) {
+ if (char_value <= 0x7f) {
+ Appender(static_cast<unsigned char>(char_value), output);
+ } else if (char_value <= 0x7ff) {
+ // 110xxxxx 10xxxxxx
+ Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+ output);
+ } else if (char_value <= 0xffff) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+ output);
+ } else if (char_value <= 0x10FFFF) { // Max Unicode code point.
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)),
+ output);
+ Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
+ output);
+ } else {
+ // Invalid Unicode code point (greater than 0x10FFFF).
+ GURL_NOTREACHED();
+ }
+}
+
+// Helper used by AppendUTF8Value below. We use an unsigned parameter so there
+// are no funny sign problems with the input, but then have to convert it to
+// a regular char for appending.
+inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) {
+ output->push_back(static_cast<char>(ch));
+}
+
+// Writes the given character to the output as UTF-8. This does NO checking
+// of the validity of the Unicode characters; the caller should ensure that
+// the value it is appending is valid to append.
+inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
+ DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);
+}
+
+// Writes the given character to the output as UTF-8, escaping ALL
+// characters (even when they are ASCII). This does NO checking of the
+// validity of the Unicode characters; the caller should ensure that the value
+// it is appending is valid to append.
+inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
+ DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);
+}
+
+// UTF-16 functions -----------------------------------------------------------
+
+// Reads one character in UTF-16 starting at |*begin| in |str| and places
+// the decoded value into |*code_point|. If the character is valid, we will
+// return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point|.
+//
+// |*begin| will be updated to point to the last character consumed so it
+// can be incremented in a loop and will be ready for the next character.
+// (for a single-16-bit-word character, it will not be changed).
+COMPONENT_EXPORT(URL)
+bool ReadUTFChar(const gurl_base::char16* str,
+ int* begin,
+ int length,
+ unsigned* code_point_out);
+
+// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method.
+inline void AppendUTF16Value(unsigned code_point,
+ CanonOutputT<gurl_base::char16>* output) {
+ if (code_point > 0xffff) {
+ output->push_back(static_cast<gurl_base::char16>((code_point >> 10) + 0xd7c0));
+ output->push_back(static_cast<gurl_base::char16>((code_point & 0x3ff) | 0xdc00));
+ } else {
+ output->push_back(static_cast<gurl_base::char16>(code_point));
+ }
+}
+
+// Escaping functions ---------------------------------------------------------
+
+// Writes the given character to the output as UTF-8, escaped. Call this
+// function only when the input is wide. Returns true on success. Failure
+// means there was some problem with the encoding, we'll still try to
+// update the |*begin| pointer and add a placeholder character to the
+// output so processing can continue.
+//
+// We will append the character starting at ch[begin] with the buffer ch
+// being |length|. |*begin| will be updated to point to the last character
+// consumed (we may consume more than one for UTF-16) so that if called in
+// a loop, incrementing the pointer will move to the next character.
+//
+// Every single output character will be escaped. This means that if you
+// give it an ASCII character as input, it will be escaped. Some code uses
+// this when it knows that a character is invalid according to its rules
+// for validity. If you don't want escaping for ASCII characters, you will
+// have to filter them out prior to calling this function.
+//
+// Assumes that ch[begin] is within range in the array, but does not assume
+// that any following characters are.
+inline bool AppendUTF8EscapedChar(const gurl_base::char16* str, int* begin,
+ int length, CanonOutput* output) {
+ // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
+ // us the kUnicodeReplacementCharacter, so we don't have to do special
+ // checking after failure, just pass through the failure to the caller.
+ unsigned char_value;
+ bool success = ReadUTFChar(str, begin, length, &char_value);
+ AppendUTF8EscapedValue(char_value, output);
+ return success;
+}
+
+// Handles UTF-8 input. See the wide version above for usage.
+inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length,
+ CanonOutput* output) {
+ // ReadUTFChar will handle invalid characters for us and give us the
+ // kUnicodeReplacementCharacter, so we don't have to do special checking
+ // after failure, just pass through the failure to the caller.
+ unsigned ch;
+ bool success = ReadUTFChar(str, begin, length, &ch);
+ AppendUTF8EscapedValue(ch, output);
+ return success;
+}
+
+// Given a '%' character at |*begin| in the string |spec|, this will decode
+// the escaped value and put it into |*unescaped_value| on success (returns
+// true). On failure, this will return false, and will not write into
+// |*unescaped_value|.
+//
+// |*begin| will be updated to point to the last character of the escape
+// sequence so that when called with the index of a for loop, the next time
+// through it will point to the next character to be considered. On failure,
+// |*begin| will be unchanged.
+inline bool Is8BitChar(char c) {
+ return true; // this case is specialized to avoid a warning
+}
+inline bool Is8BitChar(gurl_base::char16 c) {
+ return c <= 255;
+}
+
+template<typename CHAR>
+inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
+ unsigned char* unescaped_value) {
+ if (*begin + 3 > end ||
+ !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) {
+ // Invalid escape sequence because there's not enough room, or the
+ // digits are not ASCII.
+ return false;
+ }
+
+ unsigned char first = static_cast<unsigned char>(spec[*begin + 1]);
+ unsigned char second = static_cast<unsigned char>(spec[*begin + 2]);
+ if (!IsHexChar(first) || !IsHexChar(second)) {
+ // Invalid hex digits, fail.
+ return false;
+ }
+
+ // Valid escape sequence.
+ *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second);
+ *begin += 2;
+ return true;
+}
+
+// Appends the given substring to the output, escaping "some" characters that
+// it feels may not be safe. It assumes the input values are all contained in
+// 8-bit although it allows any type.
+//
+// This is used in error cases to append invalid output so that it looks
+// approximately correct. Non-error cases should not call this function since
+// the escaping rules are not guaranteed!
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+ CanonOutput* output);
+void AppendInvalidNarrowString(const gurl_base::char16* spec, int begin, int end,
+ CanonOutput* output);
+
+// Misc canonicalization helpers ----------------------------------------------
+
+// Converts between UTF-8 and UTF-16, returning true on successful conversion.
+// The output will be appended to the given canonicalizer output (so make sure
+// it's empty if you want to replace).
+//
+// On invalid input, this will still write as much output as possible,
+// replacing the invalid characters with the "invalid character". It will
+// return false in the failure case, and the caller should not continue as
+// normal.
+COMPONENT_EXPORT(URL)
+bool ConvertUTF16ToUTF8(const gurl_base::char16* input,
+ int input_len,
+ CanonOutput* output);
+COMPONENT_EXPORT(URL)
+bool ConvertUTF8ToUTF16(const char* input,
+ int input_len,
+ CanonOutputT<gurl_base::char16>* output);
+
+// Converts from UTF-16 to 8-bit using the character set converter. If the
+// converter is NULL, this will use UTF-8.
+void ConvertUTF16ToQueryEncoding(const gurl_base::char16* input,
+ const Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output);
+
+// Applies the replacements to the given component source. The component source
+// should be pre-initialized to the "old" base. That is, all pointers will
+// point to the spec of the old URL, and all of the Parsed components will
+// be indices into that string.
+//
+// The pointers and components in the |source| for all non-NULL strings in the
+// |repl| (replacements) will be updated to reference those strings.
+// Canonicalizing with the new |source| and |parsed| can then combine URL
+// components from many different strings.
+void SetupOverrideComponents(const char* base,
+ const Replacements<char>& repl,
+ URLComponentSource<char>* source,
+ Parsed* parsed);
+
+// Like the above 8-bit version, except that it additionally converts the
+// UTF-16 input to UTF-8 before doing the overrides.
+//
+// The given utf8_buffer is used to store the converted components. They will
+// be appended one after another, with the parsed structure identifying the
+// appropriate substrings. This buffer is a parameter because the source has
+// no storage, so the buffer must have the same lifetime as the source
+// parameter owned by the caller.
+//
+// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
+// |source| will point into this buffer, which could be invalidated if
+// additional data is added and the CanonOutput resizes its buffer.
+//
+// Returns true on success. False means that the input was not valid UTF-16,
+// although we will have still done the override with "invalid characters" in
+// place of errors.
+bool SetupUTF16OverrideComponents(const char* base,
+ const Replacements<gurl_base::char16>& repl,
+ CanonOutput* utf8_buffer,
+ URLComponentSource<char>* source,
+ Parsed* parsed);
+
+// Implemented in url_canon_path.cc, these are required by the relative URL
+// resolver as well, so we declare them here.
+bool CanonicalizePartialPath(const char* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output);
+bool CanonicalizePartialPath(const gurl_base::char16* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output);
+
+#ifndef WIN32
+
+// Implementations of Windows' int-to-string conversions
+COMPONENT_EXPORT(URL)
+int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
+COMPONENT_EXPORT(URL)
+int _itow_s(int value, gurl_base::char16* buffer, size_t size_in_chars, int radix);
+
+// Secure template overloads for these functions
+template<size_t N>
+inline int _itoa_s(int value, char (&buffer)[N], int radix) {
+ return _itoa_s(value, buffer, N, radix);
+}
+
+template<size_t N>
+inline int _itow_s(int value, gurl_base::char16 (&buffer)[N], int radix) {
+ return _itow_s(value, buffer, N, radix);
+}
+
+// _strtoui64 and strtoull behave the same
+inline unsigned long long _strtoui64(const char* nptr,
+ char** endptr, int base) {
+ return strtoull(nptr, endptr, base);
+}
+
+#endif // !WIN32
+
+} // namespace url
+
+#endif // URL_URL_CANON_INTERNAL_H_
diff --git a/url/url_canon_internal_file.h b/url/url_canon_internal_file.h
new file mode 100644
index 0000000..3b0a81e
--- /dev/null
+++ b/url/url_canon_internal_file.h
@@ -0,0 +1,135 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_INTERNAL_FILE_H_
+#define URL_URL_CANON_INTERNAL_FILE_H_
+
+// As with url_canon_internal.h, this file is intended to be included in
+// another C++ file where the template types are defined. This allows
+// programmers to use these functions with their own string types,
+// without bloating the code by having inline templates used in
+// every call site.
+//
+// *** This file must be included after url_canon_internal as we depend on some
+// functions in it. ***
+
+#include "base/strings/string_util.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter and colon to the output, if one is found. If there is not a drive
+// spec, it won't do anything. The index of the next character in the input
+// spec is returned (after the colon when a drive spec is found, the begin
+// offset if one is not).
+template<typename CHAR>
+static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+                           CanonOutput* output) {
+  // Accepted shapes include /foo/bar, c:/foo/bar and /c:/foo (backslashes
+  // count as slashes too), so skip any leading run of slashes first.
+  const int drive_pos = begin + CountConsecutiveSlashes(spec, begin, end);
+
+  // No drive spec: nothing is consumed and |output| is left untouched.
+  if (!DoesBeginWindowsDriveSpec(spec, drive_pos, end))
+    return begin;
+
+  // From here on DoesBeginWindowsDriveSpec has vouched for a valid drive
+  // letter followed by a colon or pipe.
+  // Emit the drive letter, folding lowercase ASCII to uppercase.
+  const CHAR letter = spec[drive_pos];
+  if (gurl_base::IsAsciiLower(letter))
+    output->push_back(letter - 'a' + 'A');
+  else
+    output->push_back(static_cast<char>(letter));
+
+  // Whichever separator followed (colon or pipe), always write ":/".
+  output->push_back(':');
+  output->push_back('/');
+  return drive_pos + 2;
+}
+
+// By the time this runs the caller has already written the slash that
+// precedes this part of the path (a forward slash, not a backslash), so
+// only the text after any leading input slashes is canonicalized here.
+template<typename CHAR, typename UCHAR>
+static void FileDoPath(const CHAR* spec, int begin, int end,
+                       CanonOutput* output) {
+  // Skip however many slashes follow the drive letter, including none at
+  // all. The path canonicalizer expects its input to begin with a slash
+  // and does not check for one itself.
+  const int rest_begin = begin + CountConsecutiveSlashes(spec, begin, end);
+
+  // The regular path canonicalizer handles the remainder. It never prepends
+  // a slash (it assumes a nonempty path already starts with one), and an
+  // empty remainder is filtered out here so it is not invoked at all.
+  ParsedComponent sub_path(rest_begin, end - rest_begin);
+  if (sub_path.len <= 0)
+    return;
+
+  // DoPath needs an output component to fill in. The caller computes the
+  // real path component itself, so this one is simply thrown away.
+  ParsedComponent fake_output_path;
+  URLCanonInternal<CHAR, UCHAR>::DoPath(spec, sub_path, output,
+                                        &fake_output_path);
+}
+
+template<typename CHAR, typename UCHAR>
+static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+                                  const ParsedURL& parsed,
+                                  CanonOutput* output,
+                                  ParsedURL* new_parsed) {
+  // Credentials and port are not set for file: URLs; each of them is reset
+  // to ParsedComponent(0, -1).
+  new_parsed->port = ParsedComponent(0, -1);
+  new_parsed->password = ParsedComponent(0, -1);
+  new_parsed->username = ParsedComponent(0, -1);
+
+  // The scheme is fixed, so write it directly instead of going through
+  // the more complicated general-purpose scheme canonicalizer.
+  const char kFileScheme[] = "file";
+  new_parsed->scheme.begin = output->length();
+  for (const char* ch = kFileScheme; *ch != '\0'; ++ch)
+    output->push_back(*ch);
+  new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
+
+  // "://" -- the scheme terminator plus the two slashes that introduce
+  // the host.
+  output->push_back(':');
+  output->push_back('/');
+  output->push_back('/');
+
+  // Host: empty for many file URLs, present for UNC paths.
+  // TODO(brettw) This doesn't do any checking for host name validity. We
+  // should probably handle validity checking of UNC hosts differently than
+  // for regular IP hosts.
+  const bool host_ok = URLCanonInternal<CHAR, UCHAR>::DoHost(
+      source.host, parsed.host, output, &new_parsed->host);
+
+  // The output path always opens with one slash written here; slashes at
+  // the start of the input path are skipped by the helpers below.
+  new_parsed->path.begin = output->length();
+  output->push_back('/');
+
+  // A leading drive spec such as "c:" is copied and normalized first (when
+  // there is one), followed by the remainder of the path.
+  const int path_end = parsed.path.end();
+  const int after_drive =
+      FileDoDriveSpec(source.path, parsed.path.begin, path_end, output);
+  FileDoPath<CHAR, UCHAR>(source.path, after_drive, path_end, output);
+  new_parsed->path.len = output->length() - new_parsed->path.begin;
+
+  // Query and ref go through the standard canonicalizers. Both always run;
+  // the result is true only if host, query and ref all succeeded.
+  const bool query_ok = URLCanonInternal<CHAR, UCHAR>::DoQuery(
+      source.query, parsed.query, output, &new_parsed->query);
+  const bool ref_ok = URLCanonInternal<CHAR, UCHAR>::DoRef(
+      source.ref, parsed.ref, output, &new_parsed->ref);
+  return host_ok && query_ok && ref_ok;
+}
+
+} // namespace url
+
+#endif // URL_URL_CANON_INTERNAL_FILE_H_
diff --git a/url/url_canon_ip.cc b/url/url_canon_ip.cc
new file mode 100644
index 0000000..f7c5700
--- /dev/null
+++ b/url/url_canon_ip.cc
@@ -0,0 +1,711 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon_ip.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <limits>
+
+#include "polyfills/base/logging.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// Maps a numeric character class (CHAR_HEX, CHAR_DEC or CHAR_OCT) to the
+// radix it denotes. Any other character class yields 0.
+int BaseForType(SharedCharTypes type) {
+  if (type == CHAR_HEX)
+    return 16;
+
+  if (type == CHAR_DEC)
+    return 10;
+
+  if (type == CHAR_OCT)
+    return 8;
+
+  return 0;  // Not a numeric character class.
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoFindIPv4Components(const CHAR* spec,
+ const Component& host,
+ Component components[4]) {
+ if (!host.is_nonempty())
+ return false; // Nothing to scan.
+
+ int cur_component = 0; // Index of the component we're working on.
+ int cur_component_begin = host.begin; // Start of the current component.
+ int end = host.end(); // Scanning stops once |i| reaches this index.
+ for (int i = host.begin; /* exits via break/return */; i++) {
+ if (i >= end || spec[i] == '.') {
+ // Found the end of the current component (a dot or the end of input).
+ int component_len = i - cur_component_begin;
+ components[cur_component] = Component(cur_component_begin, component_len);
+
+ // The next component starts after the dot.
+ cur_component_begin = i + 1;
+ cur_component++;
+
+ // Don't allow empty components (two dots in a row), except we may
+ // allow an empty component at the end (this would indicate that the
+ // input ends in a dot). We also want to error if the component is
+ // empty and it's the only component (cur_component == 1).
+ if (component_len == 0 && (i < end || cur_component == 1))
+ return false;
+
+ if (i >= end)
+ break; // End of the input.
+
+ if (cur_component == 4) {
+ // Anything else after the 4th component is an error unless it is a
+ // dot that would otherwise be treated as the end of input.
+ if (spec[i] == '.' && i + 1 == end)
+ break;
+ return false;
+ }
+ } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
+ !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+ // Non-ASCII, or ASCII that IsIPv4Char rejects: not an IPv4 address.
+ return false;
+ }
+ }
+
+ // Fill in any unused components with default-constructed entries.
+ while (cur_component < 4)
+ components[cur_component++] = Component();
+ return true;
+}
+
+// Converts an IPv4 component to a 32-bit number, while checking for overflow.
+// |number| is written only when IPV4 is returned.
+//
+// Possible return values:
+// - IPV4 - The number was valid, and did not overflow.
+// - BROKEN - The input was numeric, but too large for a 32-bit field.
+// - NEUTRAL - Input was not numeric.
+// The input is assumed to be ASCII. FindIPv4Components should have stripped
+// out any input that is greater than 7 bits. The components are assumed
+// to be non-empty.
+template<typename CHAR>
+CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
+ const Component& component,
+ uint32_t* number) {
+ // Figure out the base: "0x"/"0X" is hex, any other leading 0 is octal.
+ SharedCharTypes base;
+ int base_prefix_len = 0; // Size of the prefix for this base.
+ if (spec[component.begin] == '0') {
+ // Either hex or octal, or a standalone zero.
+ if (component.len == 1) {
+ base = CHAR_DEC;
+ } else if (spec[component.begin + 1] == 'X' ||
+ spec[component.begin + 1] == 'x') {
+ base = CHAR_HEX;
+ base_prefix_len = 2;
+ } else {
+ base = CHAR_OCT;
+ base_prefix_len = 1;
+ }
+ } else {
+ base = CHAR_DEC;
+ }
+
+ // Extend the prefix to consume all leading zeros.
+ while (base_prefix_len < component.len &&
+ spec[component.begin + base_prefix_len] == '0')
+ base_prefix_len++;
+
+ // Put the component, minus any base prefix, into a NUL-terminated buffer so
+ // we can call the standard library. Because leading zeros have already been
+ // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
+ // overflow check.
+ const int kMaxComponentLen = 16;
+ char buf[kMaxComponentLen + 1]; // digits + '\0'
+ int dest_i = 0;
+ for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
+ // We know the input is 7-bit, so convert to narrow (if this is the wide
+ // version of the template) by casting.
+ char input = static_cast<char>(spec[i]);
+
+ // Validate that this character is OK for the given base.
+ if (!IsCharOfType(input, base))
+ return CanonHostInfo::NEUTRAL;
+
+ // Fill the buffer, if there's space remaining. This check allows us to
+ // verify that all characters are numeric, even those that don't fit.
+ if (dest_i < kMaxComponentLen)
+ buf[dest_i++] = input;
+ }
+
+ buf[dest_i] = '\0';
+
+ // Parse as a 64-bit value so we get a big number (no hex, decimal, or octal
+ // number can overflow a 64-bit number in <= 16 characters).
+ uint64_t num = _strtoui64(buf, NULL, BaseForType(base));
+
+ // Check for 32-bit overflow.
+ if (num > std::numeric_limits<uint32_t>::max())
+ return CanonHostInfo::BROKEN;
+
+ // No overflow. Success!
+ *number = static_cast<uint32_t>(num);
+ return CanonHostInfo::IPV4;
+}
+
+// See declaration of IPv4AddressToNumber for documentation.
+template<typename CHAR>
+CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
+ const Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) {
+ // The identified components. Not all may exist.
+ Component components[4];
+ if (!FindIPv4Components(spec, host, components))
+ return CanonHostInfo::NEUTRAL;
+
+ // Convert existing components to numbers. Values up to
+ // |existing_components| will be valid.
+ uint32_t component_values[4];
+ int existing_components = 0;
+
+ // Set to true if one or more components are BROKEN. BROKEN is only
+ // returned if all components are IPV4 or BROKEN, so, for example,
+ // 12345678912345.de returns NEUTRAL rather than broken.
+ bool broken = false;
+ for (int i = 0; i < 4; i++) {
+ if (components[i].len <= 0)
+ continue; // Skip components with no characters.
+ CanonHostInfo::Family family = IPv4ComponentToNumber(
+ spec, components[i], &component_values[existing_components]);
+
+ if (family == CanonHostInfo::BROKEN) {
+ broken = true;
+ } else if (family != CanonHostInfo::IPV4) {
+ // Stop if we hit a non-BROKEN invalid non-empty component.
+ return family;
+ }
+
+ existing_components++;
+ }
+
+ if (broken)
+ return CanonHostInfo::BROKEN;
+
+ // Use that sequence of numbers to fill out the 4-component IP address.
+
+ // First, process all components but the last, while making sure each fits
+ // within an 8-bit field.
+ for (int i = 0; i < existing_components - 1; i++) {
+ if (component_values[i] > std::numeric_limits<uint8_t>::max())
+ return CanonHostInfo::BROKEN;
+ address[i] = static_cast<unsigned char>(component_values[i]);
+ }
+
+ // Next, spread the last component over the remaining bytes ("1.2.3" is 1.2.0.3).
+ // Work around a gcc 4.9 bug. crbug.com/392872
+#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+ uint32_t last_value = component_values[existing_components - 1];
+#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
+#pragma GCC diagnostic pop
+#endif
+ for (int i = 3; i >= existing_components - 1; i--) {
+ address[i] = static_cast<unsigned char>(last_value); // Lowest byte first.
+ last_value >>= 8;
+ }
+
+ // If the last component has residual bits, report overflow.
+ if (last_value != 0)
+ return CanonHostInfo::BROKEN;
+
+ // Tell the caller how many components we saw.
+ *num_ipv4_components = existing_components;
+
+ // Success!
+ return CanonHostInfo::IPV4;
+}
+
+// Returns true once the IPV4/BROKEN verdict is final; false means NEUTRAL,
+// so the caller may want to try another interpretation of the host.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv4Address(const CHAR* spec,
+                               const Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  const CanonHostInfo::Family family = IPv4AddressToNumber(
+      spec, host, host_info->address, &host_info->num_ipv4_components);
+  host_info->family = family;
+
+  // Definitely broken: a final answer, and nothing is written to |output|.
+  if (family == CanonHostInfo::BROKEN)
+    return true;
+
+  // Anything other than IPV4 might still be IPv6 or a plain hostname.
+  if (family != CanonHostInfo::IPV4)
+    return false;
+
+  // Definitely IPv4: write its dotted form and record where it landed.
+  host_info->out_host.begin = output->length();
+  AppendIPv4Address(host_info->address, output);
+  host_info->out_host.len = output->length() - host_info->out_host.begin;
+  return true;
+}
+
+// Helper class that describes the main components of an IPv6 input string.
+// See the following examples to understand how it breaks up an input string:
+//
+// [Example 1]: input = "[::aa:bb]"
+// ==> num_hex_components = 2
+// ==> hex_components[0] = Component(3,2) "aa"
+// ==> hex_components[1] = Component(6,2) "bb"
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 2]: input = "[1:2::3:4:5]"
+// ==> num_hex_components = 5
+// ==> hex_components[0] = Component(1,1) "1"
+// ==> hex_components[1] = Component(3,1) "2"
+// ==> hex_components[2] = Component(6,1) "3"
+// ==> hex_components[3] = Component(8,1) "4"
+// ==> hex_components[4] = Component(10,1) "5"
+// ==> index_of_contraction = 2
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 3]: input = "[::ffff:192.168.0.1]"
+// ==> num_hex_components = 1
+// ==> hex_components[0] = Component(3,4) "ffff"
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(8, 11) "192.168.0.1"
+//
+// [Example 4]: input = "[1::]"
+// ==> num_hex_components = 1
+// ==> hex_components[0] = Component(1,1) "1"
+// ==> index_of_contraction = 1
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 5]: input = "[::192.168.0.1]"
+// ==> num_hex_components = 0
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(3, 11) "192.168.0.1"
+//
+struct IPv6Parsed {
+  // Returns the parse state to "nothing found yet".
+  void reset() {
+    ipv4_component.reset();
+    index_of_contraction = -1;
+    num_hex_components = 0;
+  }
+
+  // Colon-separated hex groups found in the literal; at most 8 can occur.
+  Component hex_components[8];
+
+  // How many entries of |hex_components| are in use, in the range [0,8].
+  int num_hex_components;
+
+  // Index of the hex component that the "::" contraction comes before, or
+  // -1 when the literal has no contraction.
+  int index_of_contraction;
+
+  // Character range holding an embedded IPv4 literal.
+  Component ipv4_component;
+};
+
+// Parses the IPv6 input string; |host| is the address range without the
+// square brackets. Returns true and fills |parsed| on success, or false
+// if the input is invalid.
+template<typename CHAR, typename UCHAR>
+bool DoParseIPv6(const CHAR* spec, const Component& host, IPv6Parsed* parsed) {
+ // Zero-out the info.
+ parsed->reset();
+
+ if (!host.is_nonempty())
+ return false;
+
+ // The index for start and end of address range (no brackets).
+ int begin = host.begin;
+ int end = host.end();
+
+ int cur_component_begin = begin; // Start of the current component.
+
+ // Scan through the input, searching for hex components, "::" contractions,
+ // and IPv4 components.
+ for (int i = begin; /* i <= end */; i++) {
+ bool is_colon = spec[i] == ':'; // At i == end this reads just past |host|.
+ bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
+
+ // We reached the end of the current component if we encounter a colon
+ // (separator between hex components, or start of a contraction), or end of
+ // input.
+ if (is_colon || i == end) {
+ int component_len = i - cur_component_begin;
+
+ // A component should not have more than 4 hex digits.
+ if (component_len > 4)
+ return false;
+
+ // Don't allow empty components.
+ if (component_len == 0) {
+ // The exception is when contractions appear at beginning of the
+ // input or at the end of the input.
+ if (!((is_contraction && i == begin) || (i == end &&
+ parsed->index_of_contraction == parsed->num_hex_components)))
+ return false;
+ }
+
+ // Add the hex component we just found to running list.
+ if (component_len > 0) {
+ // Can't have more than 8 components!
+ if (parsed->num_hex_components >= 8)
+ return false;
+
+ parsed->hex_components[parsed->num_hex_components++] =
+ Component(cur_component_begin, component_len);
+ }
+ }
+
+ if (i == end)
+ break; // Reached the end of the input, DONE.
+
+ // We found a "::" contraction.
+ if (is_contraction) {
+ // There can be at most one contraction in the literal.
+ if (parsed->index_of_contraction != -1)
+ return false;
+ parsed->index_of_contraction = parsed->num_hex_components;
+ ++i; // Consume the colon we peeked.
+ }
+
+ if (is_colon) {
+ // Colons are separators between components, keep track of where the
+ // current component started (after this colon).
+ cur_component_begin = i + 1;
+ } else {
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)
+ return false; // Not ASCII.
+
+ if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
+ // Regular components are hex numbers. It is also possible for
+ // a component to be an IPv4 address in dotted form.
+ if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+ // Since IPv4 address can only appear at the end, assume the rest
+ // of the string is an IPv4 address. (We will parse this separately
+ // later).
+ parsed->ipv4_component =
+ Component(cur_component_begin, end - cur_component_begin);
+ break;
+ } else {
+ // The character was neither a hex digit, nor an IPv4 character.
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+// Checks that the pieces found by the parser account for exactly 128 bits:
+// every hex component is 16 bits, an embedded IPv4 address is 32 bits, and
+// a "::" contraction is a placeholder for 16 or more bits. Returns true
+// when the sizes add up, false otherwise. On success the contraction's
+// length in bytes (0 when there is none) is written to
+// |out_num_bytes_of_contraction|.
+bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
+                             int* out_num_bytes_of_contraction) {
+  const int kAddressBytes = 16;
+
+  // Bytes that are spelled out explicitly: two per hex component, plus
+  // four more when the literal ends in a dotted IPv4 address.
+  const int ipv4_bytes = parsed.ipv4_component.is_valid() ? 4 : 0;
+  const int explicit_bytes = parsed.num_hex_components * 2 + ipv4_bytes;
+
+  // A "::" contraction soaks up whatever is left over, but it always
+  // stands for at least two bytes (one 16-bit group), even if that ends
+  // up overshooting the total.
+  int contraction_bytes = 0;
+  if (parsed.index_of_contraction != -1) {
+    const int remaining = kAddressBytes - explicit_bytes;
+    contraction_bytes = remaining < 2 ? 2 : remaining;
+  }
+
+  // Too few or too many bytes overall means the literal is malformed.
+  if (explicit_bytes + contraction_bytes != kAddressBytes)
+    return false;
+
+  *out_num_bytes_of_contraction = contraction_bytes;
+  return true;
+}
+
+// Parses one hex group of an IPv6 literal into its 16-bit value. Failure is
+// not possible: the caller has already checked that the group consists
+// solely of hex digits and is at most 4 characters long.
+template <typename CHAR>
+uint16_t IPv6HexComponentToNumber(const CHAR* spec,
+                                  const Component& component) {
+  GURL_DCHECK(component.len <= 4);
+
+  // Narrow the digits into a NUL-terminated buffer for the C library.
+  char digits[5];
+  const CHAR* const first = spec + component.begin;
+  for (int n = 0; n < component.len; ++n)
+    digits[n] = static_cast<char>(first[n]);
+  digits[component.len] = '\0';
+
+  // Four hex digits cannot exceed 0xFFFF, so the narrowing cast is lossless.
+  return static_cast<uint16_t>(_strtoui64(digits, NULL, 16));
+}
+
+// Converts an IPv6 address to a 128-bit number (network byte order). Returns
+// false for an invalid IPv6 address; |address| may be partly written then.
+template<typename CHAR, typename UCHAR>
+bool DoIPv6AddressToNumber(const CHAR* spec,
+ const Component& host,
+ unsigned char address[16]) {
+ // Make sure the component is bounded by '[' and ']'.
+ int end = host.end();
+ if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
+ return false;
+
+ // Exclude the square brackets.
+ Component ipv6_comp(host.begin + 1, host.len - 2);
+
+ // Parse the IPv6 address -- identify where all the colon separated hex
+ // components are, the "::" contraction, and the embedded IPv4 address.
+ IPv6Parsed ipv6_parsed;
+ if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
+ return false;
+
+ // Do some basic size checks to make sure that the address doesn't
+ // specify more than 128 bits or fewer than 128 bits. This also resolves
+ // how many zero bytes the "::" contraction represents.
+ int num_bytes_of_contraction;
+ if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
+ return false;
+
+ int cur_index_in_address = 0;
+
+ // Loop through the hex components and the contraction, in order.
+ for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
+ // Append the contraction if it appears before this component.
+ if (i == ipv6_parsed.index_of_contraction) {
+ for (int j = 0; j < num_bytes_of_contraction; ++j)
+ address[cur_index_in_address++] = 0;
+ }
+ // Append the hex component's value (the extra final pass has none).
+ if (i != ipv6_parsed.num_hex_components) {
+ // Get the 16-bit value for this hex component.
+ uint16_t number = IPv6HexComponentToNumber<CHAR>(
+ spec, ipv6_parsed.hex_components[i]);
+ // Append to |address|, in network byte order.
+ address[cur_index_in_address++] = (number & 0xFF00) >> 8;
+ address[cur_index_in_address++] = (number & 0x00FF);
+ }
+ }
+
+ // If there was an IPv4 section, convert it into a 32-bit number and append
+ // it to |address|.
+ if (ipv6_parsed.ipv4_component.is_valid()) {
+ // The converted bytes are written directly at the current position.
+ int ignored_num_ipv4_components;
+ if (CanonHostInfo::IPV4 !=
+ IPv4AddressToNumber(spec,
+ ipv6_parsed.ipv4_component,
+ &address[cur_index_in_address],
+ &ignored_num_ipv4_components))
+ return false;
+ }
+
+ return true;
+}
+
+// Finds the longest run of zero bytes in |address|, measured in whole
+// 16-bit groups, and stores it in |contraction_range|. Only runs longer
+// than a single group qualify, and the earliest of equally long runs wins.
+void ChooseIPv6ContractionRange(const unsigned char address[16],
+                                Component* contraction_range) {
+  Component best_run;     // Longest qualifying run seen so far.
+  Component current_run;  // Run of zero groups currently being extended.
+
+  for (int offset = 0; offset < 16; offset += 2) {
+    const bool group_is_zero =
+        address[offset] == 0 && address[offset + 1] == 0;
+
+    if (group_is_zero) {
+      // Open a new run at this offset if none is in progress, then grow it
+      // by the two bytes of this group.
+      if (!current_run.is_valid())
+        current_run = Component(offset, 0);
+      current_run.len += 2;
+    }
+
+    // A run ends at a nonzero group or at the final group of the address.
+    const bool run_ended = !group_is_zero || offset == 14;
+    if (!run_ended)
+      continue;
+
+    // Keep the finished run only if it spans more than one group and is
+    // strictly longer than the best so far (so that ties favor the first).
+    if (current_run.len > 2 && current_run.len > best_run.len)
+      best_run = current_run;
+    current_run.reset();
+  }
+  *contraction_range = best_run;
+}
+
+// Returns true once the IPV6/BROKEN verdict is final; false means NEUTRAL,
+// so the caller may want to try another interpretation of the host.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv6Address(const CHAR* spec,
+                               const Component& host,
+                               CanonOutput* output,
+                               CanonHostInfo* host_info) {
+  // Try to read the host as a 128-bit address.
+  if (IPv6AddressToNumber(spec, host, host_info->address)) {
+    // Success: emit the canonical bracketed form and record its location.
+    host_info->out_host.begin = output->length();
+    output->push_back('[');
+    AppendIPv6Address(host_info->address, output);
+    output->push_back(']');
+    host_info->out_host.len = output->length() - host_info->out_host.begin;
+
+    host_info->family = CanonHostInfo::IPV6;
+    return true;
+  }
+
+  // Not a valid IPv6 literal. If the host nevertheless contains a character
+  // that should *only* ever show up in one, it is definitely broken.
+  const int host_end = host.end();
+  for (int i = host.begin; i < host_end; i++) {
+    const CHAR ch = spec[i];
+    if (ch == '[' || ch == ']' || ch == ':') {
+      host_info->family = CanonHostInfo::BROKEN;
+      return true;
+    }
+  }
+
+  // None of those characters: may yet turn out to be IPv4 or a hostname.
+  host_info->family = CanonHostInfo::NEUTRAL;
+  return false;
+}
+
+} // namespace
+
+void AppendIPv4Address(const unsigned char address[4], CanonOutput* output) {
+  for (int octet = 0; octet < 4; ++octet) {
+    // Dots go between octets, so every octet but the first is preceded by one.
+    if (octet > 0)
+      output->push_back('.');
+    // Render the octet in decimal and copy its digits up to the terminator.
+    char digits[16];
+    _itoa_s(address[octet], digits, 10);
+    for (const char* p = digits; *p != '\0'; ++p)
+      output->push_back(*p);
+  }
+}
+
+void AppendIPv6Address(const unsigned char address[16], CanonOutput* output) {
+  // The textual form follows the rules in:
+  // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
+
+  // Decide which run of zero groups (if any) is replaced by "::".
+  Component contraction_range;
+  ChooseIPv6ContractionRange(address, &contraction_range);
+
+  int i = 0;
+  while (i <= 14) {
+    // |i| always sits on a group boundary: byte pairs (0, 1) .. (14, 15).
+    GURL_DCHECK(i % 2 == 0);
+    if (contraction_range.len > 0 && i == contraction_range.begin) {
+      // A contraction at the very start needs both colons written here;
+      // anywhere else the preceding group already supplied the first one.
+      if (i == 0)
+        output->push_back(':');
+      output->push_back(':');
+      i = contraction_range.end();
+      continue;
+    }
+
+    // Read the next group as a big-endian 16-bit value and step past it.
+    const int group = address[i] << 8 | address[i + 1];
+    i += 2;
+    // Hex-format the group; 4 digits plus the terminator fit in 5 chars.
+    char hex_digits[5];
+    _itoa_s(group, hex_digits, 16);
+    for (const char* p = hex_digits; *p != '\0'; ++p)
+      output->push_back(*p);
+    // Separate from whatever follows, unless this was the final group.
+    if (i < 16)
+      output->push_back(':');
+  }
+}
+
+bool FindIPv4Components(const char* spec,
+ const Component& host,
+ Component components[4]) { // 8-bit |spec|: forwards to the shared template.
+ return DoFindIPv4Components<char, unsigned char>(spec, host, components);
+}
+
+bool FindIPv4Components(const gurl_base::char16* spec,
+ const Component& host,
+ Component components[4]) { // 16-bit |spec|: forwards to the shared template.
+ return DoFindIPv4Components<gurl_base::char16, gurl_base::char16>(
+ spec, host, components);
+}
+
+void CanonicalizeIPAddress(const char* spec,
+                           const Component& host,
+                           CanonOutput* output,
+                           CanonHostInfo* host_info) {
+  // IPv4 goes first; IPv6 is tried only when IPv4 returns false (undecided).
+  if (!DoCanonicalizeIPv4Address<char, unsigned char>(
+          spec, host, output, host_info)) {
+    DoCanonicalizeIPv6Address<char, unsigned char>(
+        spec, host, output, host_info);
+  }
+}
+
+void CanonicalizeIPAddress(const gurl_base::char16* spec,
+                           const Component& host,
+                           CanonOutput* output,
+                           CanonHostInfo* host_info) {
+  // IPv4 goes first; IPv6 is tried only when IPv4 returns false (undecided).
+  if (!DoCanonicalizeIPv4Address<gurl_base::char16, gurl_base::char16>(
+          spec, host, output, host_info)) {
+    DoCanonicalizeIPv6Address<gurl_base::char16, gurl_base::char16>(
+        spec, host, output, host_info);
+  }
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
+ const Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) { // 8-bit |spec| overload.
+ return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components);
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const gurl_base::char16* spec,
+ const Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) { // 16-bit |spec| overload.
+ return DoIPv4AddressToNumber<gurl_base::char16>(
+ spec, host, address, num_ipv4_components);
+}
+
+bool IPv6AddressToNumber(const char* spec,
+ const Component& host,
+ unsigned char address[16]) { // 8-bit |spec| overload.
+ return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
+}
+
+bool IPv6AddressToNumber(const gurl_base::char16* spec,
+ const Component& host,
+ unsigned char address[16]) { // 16-bit |spec| overload.
+ return DoIPv6AddressToNumber<gurl_base::char16, gurl_base::char16>(spec, host, address);
+}
+
+} // namespace url
diff --git a/url/url_canon_ip.h b/url/url_canon_ip.h
new file mode 100644
index 0000000..5d93f28
--- /dev/null
+++ b/url/url_canon_ip.h
@@ -0,0 +1,88 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_IP_H_
+#define URL_URL_CANON_IP_H_
+
+#include "polyfills/base/component_export.h"
+#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+
+namespace url {
+
+// Appends the dotted-decimal text form of the 4-byte IPv4 |address| to |output|.
+COMPONENT_EXPORT(URL)
+void AppendIPv4Address(const unsigned char address[4], CanonOutput* output);
+
+// Appends the text form of the 16-byte IPv6 |address| to |output| (no brackets).
+COMPONENT_EXPORT(URL)
+void AppendIPv6Address(const unsigned char address[16], CanonOutput* output);
+
+// Searches the host name for the dot-separated portions of an IPv4 address.
+// On success, each component is placed into |components| and true is returned.
+// It will return false if the host cannot be separated as an IPv4 address
+// or if there are any non-7-bit characters or other characters that cannot
+// be in an IP address. (This is important so we fail as early as possible for
+// common non-IP hostnames.)
+//
+// Not all components may exist. If there are only 3 components, for example,
+// the last one will have a length of -1 or 0 to indicate it does not exist.
+//
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
+// address. IE6 is included in this. We do NOT handle this case. In many cases,
+// the browser's canonicalization will get run before this which converts
+// spaces to %20 (in the case of IE7) or rejects them (in the case of Mozilla),
+// so this code path never gets hit. Our host canonicalization will notice
+// these spaces and escape them, which will make IP address finding fail. This
+// seems like better behavior than stripping after a space.
+COMPONENT_EXPORT(URL)
+bool FindIPv4Components(const char* spec,
+ const Component& host,
+ Component components[4]);
+COMPONENT_EXPORT(URL)
+bool FindIPv4Components(const gurl_base::char16* spec,
+ const Component& host,
+ Component components[4]);
+
+// Converts the text of an IPv4 address to 4 bytes in |address| (network order).
+//
+// Possible return values:
+// IPV4 - IPv4 address was successfully parsed.
+// BROKEN - Input was formatted like an IPv4 address, but overflow occurred
+// during parsing.
+// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
+// It might be an IPv6 address, or a hostname.
+//
+// On success, |num_ipv4_components| will be populated with the number of
+// components in the IPv4 address.
+COMPONENT_EXPORT(URL)
+CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
+ const Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
+COMPONENT_EXPORT(URL)
+CanonHostInfo::Family IPv4AddressToNumber(const gurl_base::char16* spec,
+ const Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
+
+// Converts the text of an IPv6 address to 16 bytes in |address| (network byte
+// order). Returns true on success, false if the input is not valid IPv6.
+//
+// NOTE that |host| is expected to be surrounded by square brackets.
+// i.e. "[::1]" rather than "::1".
+COMPONENT_EXPORT(URL)
+bool IPv6AddressToNumber(const char* spec,
+ const Component& host,
+ unsigned char address[16]);
+COMPONENT_EXPORT(URL)
+bool IPv6AddressToNumber(const gurl_base::char16* spec,
+ const Component& host,
+ unsigned char address[16]);
+
+} // namespace url
+
+#endif // URL_URL_CANON_IP_H_
diff --git a/url/url_canon_mailtourl.cc b/url/url_canon_mailtourl.cc
new file mode 100644
index 0000000..f09faa7
--- /dev/null
+++ b/url/url_canon_mailtourl.cc
@@ -0,0 +1,127 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "mailto:" URLs.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+namespace {
+
+// Reports whether |uch| must be percent-encoded in the path component of a
+// mailto URL, to improve compatibility and mitigate command-injection attacks
+// on mailto handlers. See https://crbug.com/711020.
+template <typename UCHAR>
+bool ShouldEncodeMailboxCharacter(UCHAR uch) {
+  // Space and control characters, DEL, and everything beyond 7-bit ASCII.
+  if (uch < 0x21 || uch > 0x7e)
+    return true;
+
+  // The remaining printable ASCII characters that get encoded.
+  const bool is_quote = (uch == 0x22);
+  const bool is_angle_bracket = (uch == 0x3c || uch == 0x3e);
+  const bool is_backtick = (uch == 0x60);
+  const bool is_brace_or_pipe = (uch >= 0x7b && uch <= 0x7d);
+  return is_quote || is_angle_bracket || is_backtick || is_brace_or_pipe;
+}
+
+template <typename CHAR, typename UCHAR>
+bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
+                             const Parsed& parsed,
+                             CanonOutput* output,
+                             Parsed* new_parsed) {
+  // A canonical mailto: URL is made of {scheme, path, query} only, so all
+  // other components are replaced with default-constructed ones.
+  new_parsed->username = Component();
+  new_parsed->password = Component();
+  new_parsed->host = Component();
+  new_parsed->port = Component();
+  new_parsed->ref = Component();
+
+  // The scheme is known in advance, so it is written out literally instead
+  // of going through the scheme canonicalizer. The trailing ':' is appended
+  // to the output but is not counted in the scheme component's length.
+  new_parsed->scheme.begin = output->length();
+  output->Append("mailto:", 7);
+  new_parsed->scheme.len = 6;
+
+  bool all_chars_valid = true;
+
+  // Path
+  if (!parsed.path.is_valid()) {
+    // No path at all
+    new_parsed->path.reset();
+  } else {
+    new_parsed->path.begin = output->length();
+    // Characters picked out by ShouldEncodeMailboxCharacter() are appended
+    // as escaped UTF-8; all other characters are copied through unchanged.
+    const int path_end = parsed.path.end();
+    for (int pos = parsed.path.begin; pos < path_end; ++pos) {
+      const UCHAR ch = static_cast<UCHAR>(source.path[pos]);
+      if (!ShouldEncodeMailboxCharacter<UCHAR>(ch)) {
+        output->push_back(static_cast<char>(ch));
+      } else if (!AppendUTF8EscapedChar(source.path, &pos, path_end, output)) {
+        all_chars_valid = false;
+      }
+    }
+    new_parsed->path.len = output->length() - new_parsed->path.begin;
+  }
+
+  // Query -- always use the default UTF8 charset converter.
+  CanonicalizeQuery(source.query, parsed.query, NULL,
+                    output, &new_parsed->query);
+
+  return all_chars_valid;
+}
+
+} // namespace
+
+bool CanonicalizeMailtoURL(const char* spec, int spec_len,
+                           const Parsed& parsed, CanonOutput* output,
+                           Parsed* new_parsed) {
+  // |spec_len| is not consulted here; |parsed| delimits the components.
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(source, parsed, output,
+                                                      new_parsed);
+}
+
+bool CanonicalizeMailtoURL(const gurl_base::char16* spec, int spec_len,
+                           const Parsed& parsed, CanonOutput* output,
+                           Parsed* new_parsed) {
+  // |spec_len| is not consulted here; |parsed| delimits the components.
+  const URLComponentSource<gurl_base::char16> source(spec);
+  return DoCanonicalizeMailtoURL<gurl_base::char16, gurl_base::char16>(
+      source, parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base, const Parsed& base_parsed,
+                      const Replacements<char>& replacements,
+                      CanonOutput* output, Parsed* new_parsed) {
+  // Start from the base URL's components, hand them to
+  // SetupOverrideComponents() with |replacements|, then canonicalize.
+  Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupOverrideComponents(base, replacements, &merged_source, &merged_parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base, const Parsed& base_parsed,
+                      const Replacements<gurl_base::char16>& replacements,
+                      CanonOutput* output, Parsed* new_parsed) {
+  // |utf8_scratch| is the buffer handed to SetupUTF16OverrideComponents().
+  RawCanonOutput<1024> utf8_scratch;
+  Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_scratch,
+                               &merged_source, &merged_parsed);
+  return DoCanonicalizeMailtoURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc
new file mode 100644
index 0000000..ee18aa2
--- /dev/null
+++ b/url/url_canon_path.cc
@@ -0,0 +1,437 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <limits.h>
+
+#include "polyfills/base/logging.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+namespace {
+
+enum CharacterFlags {
+ // Pass through unchanged, whether escaped or unescaped. Its value is zero,
+ // so it can't be tested for with a bitwise operation; it's just to make the
+ // table below more clear when neither ESCAPE nor UNESCAPE is set.
+ PASS = 0,
+
+ // This character requires special handling in DoPartialPath. Doing this test
+ // first allows us to filter out the common cases of regular characters that
+ // can be directly copied.
+ SPECIAL = 1,
+
+ // This character must be escaped in the canonical output. Note that all
+ // escaped chars also have the "special" bit set so that the code that looks
+ // for this is triggered. Not valid with PASS or UNESCAPE.
+ ESCAPE_BIT = 2,
+ ESCAPE = ESCAPE_BIT | SPECIAL,
+
+ // This character must be unescaped in canonical output. Not valid with
+ // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
+ // characters unescaped, they should just be copied.
+ UNESCAPE = 4,
+
+ // This character is disallowed in URLs. Note that the "special" bit is also
+ // set to trigger handling.
+ INVALID_BIT = 8,
+ INVALID = INVALID_BIT | SPECIAL,
+};
+
+// Each entry of this table, indexed by character value, holds one of the
+// above flag values. Note some flags are more than one bit because they also
+// turn on the "special" flag, the only flag that may be combined with others.
+//
+// This table is designed to match exactly what IE does with the characters.
+//
+// Dot is even more special, and the escaped version is handled specially by
+// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
+// bit is never handled (we just need the "special" bit).
+const unsigned char kPathCharLookup[0x100] = {
+// NULL control chars...
+ INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+// control chars...
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE,
+// @ A B C D E F G H I J K L M N O
+ PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE,
+// ` a b c d e f g h i j k l m n o
+ ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+// p q r s t u v w x y z { | } ~ <DEL>
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE,
+// ...all the high-bit characters are escaped
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE};
+
+enum DotDisposition {
+ // The given dot is just part of a filename and is not special.
+ NOT_A_DIRECTORY,
+
+ // The given dot is the current directory ("." at the end, or "./").
+ DIRECTORY_CUR,
+
+ // The given dot is the first of a double dot ("..") that takes us up one.
+ DIRECTORY_UP
+};
+
+// Called by the path resolver when it finds a dot. |after_dot| is the index
+// in |spec| of the character following that dot, and |end| is the end of the
+// input. The return value indicates what type the dot is (see DotDisposition
+// above). A dot located at the very end of the input is handled as well.
+//
+// |*consumed_len| receives the number of input characters following the
+// first dot that are part of what was found. Accounting for the length of
+// the first dot itself is left to the caller.
+//
+// For example, with the input "../foo", |after_dot| = 1 and |end| = 6, the
+// result is DIRECTORY_UP with |*consumed_len| = 2 for the "./" consumed here.
+// Had the input been "./foo", the result would be DIRECTORY_CUR with
+// |*consumed_len| = 1 for the slash.
+template<typename CHAR>
+DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
+                                int end, int* consumed_len) {
+  // A single dot that ends the input or is followed by a slash names the
+  // current directory.
+  const bool dot_ends_input = (after_dot == end);
+  if (dot_ends_input || IsURLSlash(spec[after_dot])) {
+    // Only the slash, when there is one, is consumed.
+    *consumed_len = dot_ends_input ? 0 : 1;
+    return DIRECTORY_CUR;
+  }
+
+  // Otherwise check for a second dot. A non-zero result from IsDot() is used
+  // below as the number of input characters that second dot occupies.
+  const int second_dot_len = IsDot(spec, after_dot, end);
+  if (second_dot_len != 0) {
+    // A double dot that ends the input or is followed by a slash names the
+    // parent directory. The second dot is consumed, as is the slash if any.
+    const int after_second_dot = after_dot + second_dot_len;
+    const bool dots_end_input = (after_second_dot == end);
+    if (dots_end_input || IsURLSlash(spec[after_second_dot])) {
+      *consumed_len =
+          dots_end_input ? second_dot_len : second_dot_len + 1;
+      return DIRECTORY_UP;
+    }
+  }
+
+  // The dots are followed by something else, not a directory.
+  *consumed_len = 0;
+  return NOT_A_DIRECTORY;
+}
+
+// Rewinds the output to the previous slash. It is assumed that the output
+// ends with a slash and this doesn't count (we call this when we are
+// appending directory paths, so the previous path component has an ending
+// slash).
+//
+// This will stop at the first slash (assumed to be at position
+// |path_begin_in_output|) and not go any higher than that. Some web pages
+// do ".." too many times, so we need to handle that brokenness.
+//
+// It searches for a literal slash rather than including a backslash as well
+// because it is run only on the canonical output.
+//
+// The output is guaranteed to end in a slash when this function completes.
+void BackUpToPreviousSlash(int path_begin_in_output,
+                           CanonOutput* output) {
+  GURL_DCHECK(output->length() > 0);
+  int i = output->length() - 1;
+  GURL_DCHECK(output->at(i) == '/');
+  if (i == path_begin_in_output)
+    return;  // We're at the first slash, nothing to do.
+
+  // Now back up (skipping the trailing slash) until we find another slash,
+  // testing the lower bound before each read so |i| is validated first.
+  i--;
+  while (i > path_begin_in_output && output->at(i) != '/')
+    i--;
+
+  // Now shrink the output to just include that last slash we found.
+  output->set_length(i + 1);
+}
+
+// Looks for problematic nested escape sequences and escapes the output as
+// needed to ensure they can't be misinterpreted.
+//
+// Our concern is that an input escape sequence that's invalid because it
+// contains nested escape sequences might look valid once those are unescaped.
+// For example, "%%300" is not a valid escape sequence, but after unescaping the
+// inner "%30" this becomes "%00" which is valid. Leaving this in the output
+// string can result in callers re-canonicalizing the string and unescaping this
+// sequence, thus resulting in something fundamentally different than the
+// original input here. This can cause a variety of problems.
+//
+// This function is called after we've just unescaped a sequence that's within
+// two output characters of a previous '%' that we know didn't begin a valid
+// escape sequence in the input string. We look for whether the output is going
+// to turn into a valid escape sequence, and if so, convert the initial '%' into
+// an escaped "%25" so the output can't be misinterpreted.
+//
+// |spec| is the input string we're canonicalizing.
+// |next_input_index| is the index of the next unprocessed character in |spec|.
+// |input_len| is the length of |spec|.
+// |last_invalid_percent_index| is the index in |output| of a previously-seen
+// '%' character. The caller knows this '%' character isn't followed by a valid
+// escape sequence in the input string.
+// |output| is the canonicalized output thus far. The caller guarantees this
+// ends with a '%' followed by one or two characters, and the '%' is the one
+// pointed to by |last_invalid_percent_index|. The last character in the string
+// was just unescaped.
+template<typename CHAR>
+void CheckForNestedEscapes(const CHAR* spec,
+ int next_input_index,
+ int input_len,
+ int last_invalid_percent_index,
+ CanonOutput* output) {
+ const int length = output->length();
+ const char last_unescaped_char = output->at(length - 1);
+
+ // If |output| currently looks like "%c", we need to try appending the next
+ // input character to see if this will result in a problematic escape
+ // sequence. Note that this won't trigger on the first nested escape of a
+ // two-escape sequence like "%%30%30" -- we'll allow the conversion to
+ // "%0%30" -- but the second nested escape will be caught by this function
+ // when it's called again in that case.
+ const bool append_next_char = last_invalid_percent_index == length - 2;
+ if (append_next_char) {
+ // If the input doesn't contain a 7-bit character next, this case won't be a
+ // problem. NOTE(review): ">= 0x80" looks moot for signed |char| - confirm.
+ if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80))
+ return;
+ output->push_back(static_cast<char>(spec[next_input_index]));
+ }
+
+ // Now output ends like "%cc". Try to unescape this.
+ int begin = last_invalid_percent_index;
+ unsigned char temp;
+ if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) {
+ // New escape sequence found. Overwrite the characters following the '%'
+ // with "25", and push_back() the one or two characters that were following
+ // the '%' when we were called.
+ if (!append_next_char)
+ output->push_back(output->at(last_invalid_percent_index + 1));
+ output->set(last_invalid_percent_index + 1, '2');
+ output->set(last_invalid_percent_index + 2, '5');
+ output->push_back(last_unescaped_char);
+ } else if (append_next_char) {
+ // Not a valid escape sequence, but we still need to undo appending the next
+ // source character so the caller can process it normally.
+ output->set_length(length);
+ }
+}
+
+// Appends the given path to the output. It assumes that if the input path
+// starts with a slash, it should be copied to the output. If no path has
+// already been appended to the output (the case when not resolving
+// relative URLs), the path should begin with a slash.
+//
+// If there are already path components (this mode is used when appending
+// relative paths for resolving), it assumes that the output already has
+// a trailing slash and that if the input begins with a slash, it should be
+// copied to the output.
+//
+// Runs of slashes are not collapsed, since it seems no web browsers do this.
+// Returns false if a character flagged INVALID was seen (raw or escaped) or
+// AppendUTF8EscapedChar() reported failure; output is appended regardless.
+template<typename CHAR, typename UCHAR>
+bool DoPartialPath(const CHAR* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {
+ int end = path.end();
+
+ // We use this variable to minimize the amount of work done when unescaping --
+ // we'll only call CheckForNestedEscapes() when this points at one of the last
+ // couple of characters in |output|.
+ int last_invalid_percent_index = INT_MIN;
+
+ bool success = true;
+ for (int i = path.begin; i < end; i++) {
+ UCHAR uch = static_cast<UCHAR>(spec[i]);
+ if (sizeof(CHAR) > 1 && uch >= 0x80) {
+ // We only need to test wide input for having non-ASCII characters. For
+ // narrow input, we'll always just use the lookup table. We don't try to
+ // do anything tricky with decoding/validating UTF-8. This function will
+ // read one or two UTF-16 characters and append the output as UTF-8. This
+ // call will be removed in 8-bit mode.
+ success &= AppendUTF8EscapedChar(spec, &i, end, output);
+ } else {
+ // Normal ASCII character or 8-bit input, use the lookup table.
+ unsigned char out_ch = static_cast<unsigned char>(uch);
+ unsigned char flags = kPathCharLookup[out_ch];
+ if (flags & SPECIAL) {
+ // Needs special handling of some sort.
+ int dotlen;
+ if ((dotlen = IsDot(spec, i, end)) > 0) {
+ // See if this dot was preceded by a slash in the output. We
+ // assume that when canonicalizing paths, they will always
+ // start with a slash and not a dot, so we don't have to
+ // bounds check the output.
+ //
+ // Note that we check this in the case of dots so we don't have to
+ // special case slashes. Since slashes are much more common than
+ // dots, this actually increases performance measurably (though
+ // slightly).
+ GURL_DCHECK(output->length() > path_begin_in_output);
+ if (output->length() > path_begin_in_output &&
+ output->at(output->length() - 1) == '/') {
+ // Slash followed by a dot; check whether this means a relative dir.
+ int consumed_len;
+ switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
+ &consumed_len)) {
+ case NOT_A_DIRECTORY:
+ // Copy the dot to the output, it means nothing special.
+ output->push_back('.');
+ i += dotlen - 1;
+ break;
+ case DIRECTORY_CUR: // Current directory, just skip the input.
+ i += dotlen + consumed_len - 1;
+ break;
+ case DIRECTORY_UP:
+ BackUpToPreviousSlash(path_begin_in_output, output);
+ i += dotlen + consumed_len - 1;
+ break;
+ }
+ } else {
+ // This dot is not preceded by a slash, it is just part of some
+ // file name.
+ output->push_back('.');
+ i += dotlen - 1;
+ }
+
+ } else if (out_ch == '\\') {
+ // Convert backslashes to forward slashes
+ output->push_back('/');
+
+ } else if (out_ch == '%') {
+ // Handle escape sequences.
+ unsigned char unescaped_value;
+ if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
+ // Valid escape sequence, see if we keep, reject, or unescape it.
+ // Note that at this point DecodeEscaped() will have advanced |i| to
+ // the last character of the escape sequence.
+ char unescaped_flags = kPathCharLookup[unescaped_value];
+
+ if (unescaped_flags & UNESCAPE) {
+ // This escaped value shouldn't be escaped. Try to copy it.
+ output->push_back(unescaped_value);
+ // If we just unescaped a value within 2 output characters of the
+ // '%' from a previously-detected invalid escape sequence, we
+ // might have an input string with problematic nested escape
+ // sequences; detect and fix them.
+ if (last_invalid_percent_index >= (output->length() - 3)) {
+ CheckForNestedEscapes(spec, i + 1, end,
+ last_invalid_percent_index, output);
+ }
+ } else {
+ // Either this is an invalid escaped character, or it's a valid
+ // escaped character we should keep escaped. In the first case we
+ // should just copy it exactly and remember the error. In the
+ // second we also copy exactly in case the server is sensitive to
+ // changing the case of any hex letters.
+ output->push_back('%');
+ output->push_back(static_cast<char>(spec[i - 1]));
+ output->push_back(static_cast<char>(spec[i]));
+ if (unescaped_flags & INVALID_BIT)
+ success = false;
+ }
+ } else {
+ // Invalid escape sequence. IE7+ rejects any URLs with such
+ // sequences, while other browsers pass them through unchanged. We
+ // use the permissive behavior.
+ // TODO(brettw): Consider testing IE's strict behavior, which would
+ // allow removing the code to handle nested escapes above.
+ last_invalid_percent_index = output->length();
+ output->push_back('%');
+ }
+
+ } else if (flags & INVALID_BIT) {
+ // For NULLs, etc. fail.
+ AppendEscapedChar(out_ch, output);
+ success = false;
+
+ } else if (flags & ESCAPE_BIT) {
+ // This character should be escaped.
+ AppendEscapedChar(out_ch, output);
+ }
+ } else {
+ // Nothing special about this character, just append it.
+ output->push_back(out_ch);
+ }
+ }
+ }
+ return success;
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoPath(const CHAR* spec,
+            const Component& path,
+            CanonOutput* output,
+            Component* out_path) {
+  const int path_begin_in_output = output->length();
+  out_path->begin = path_begin_in_output;
+  bool path_ok = true;
+  if (path.len <= 0) {
+    // No input, canonical path is a slash.
+    output->push_back('/');
+  } else {
+    // A URL that was just parsed already starts its path with a slash, but
+    // the replacement and relative-resolving cases of file URLs may not;
+    // write out the initial slash for those.
+    if (!IsURLSlash(spec[path.begin]))
+      output->push_back('/');
+    path_ok = DoPartialPath<CHAR, UCHAR>(spec, path, path_begin_in_output,
+                                         output);
+  }
+  out_path->len = output->length() - path_begin_in_output;
+  return path_ok;
+}
+
+} // namespace
+
+bool CanonicalizePath(const char* spec, const Component& path,
+                      CanonOutput* output, Component* out_path) {
+  // 8-bit input; the shared implementation reads it as unsigned char.
+  const bool ok = DoPath<char, unsigned char>(spec, path, output, out_path);
+  return ok;
+}
+
+bool CanonicalizePath(const gurl_base::char16* spec, const Component& path,
+                      CanonOutput* output, Component* out_path) {
+  // 16-bit input; gurl_base::char16 doubles as the UCHAR template argument.
+  return DoPath<gurl_base::char16, gurl_base::char16>(
+      spec, path, output, out_path);
+}
+
+bool CanonicalizePartialPath(const char* spec, const Component& path,
+                             int path_begin_in_output, CanonOutput* output) {
+  // 8-bit input; the shared implementation reads it as unsigned char.
+  const bool ok = DoPartialPath<char, unsigned char>(
+      spec, path, path_begin_in_output, output);
+  return ok;
+}
+
+bool CanonicalizePartialPath(const gurl_base::char16* spec,
+                             const Component& path, int path_begin_in_output,
+                             CanonOutput* output) {
+  // 16-bit input; gurl_base::char16 doubles as the UCHAR template argument.
+  const bool ok = DoPartialPath<gurl_base::char16, gurl_base::char16>(
+      spec, path, path_begin_in_output, output);
+  return ok;
+}
+
+} // namespace url
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
new file mode 100644
index 0000000..62fe22f
--- /dev/null
+++ b/url/url_canon_pathurl.cc
@@ -0,0 +1,122 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions for canonicalizing "path" URLs. Not to be confused with the path
+// of a URL, these are URLs that have no authority section, only a path. For
+// example, "javascript:" and "data:".
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace {
+
+// Canonicalizes |component| of |source| into |output| and records where it
+// ended up in |new_component|. A non-zero |separator| (i.e. '?' or '#') is
+// written to |output| just ahead of the component and is not part of it.
+template <typename CHAR, typename UCHAR>
+void DoCanonicalizePathComponent(const CHAR* source,
+                                 const Component& component,
+                                 char separator,
+                                 CanonOutput* output,
+                                 Component* new_component) {
+  if (!component.is_valid()) {
+    // Empty part.
+    new_component->reset();
+    return;
+  }
+  if (separator)
+    output->push_back(separator);
+  // Path URLs (think javascript:) get lax escaping: control characters and
+  // non-ASCII input are appended as escaped UTF-8, while all other ASCII
+  // characters are left alone. This helps readability of JavaScript.
+  new_component->begin = output->length();
+  const int component_end = component.end();
+  for (int pos = component.begin; pos < component_end; pos++) {
+    const UCHAR ch = static_cast<UCHAR>(source[pos]);
+    if (ch >= 0x20 && ch < 0x80)
+      output->push_back(static_cast<char>(ch));
+    else
+      AppendUTF8EscapedChar(source, &pos, component_end, output);
+  }
+  new_component->len = output->length() - new_component->begin;
+}
+
+template <typename CHAR, typename UCHAR>
+bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
+                           const Parsed& parsed,
+                           CanonOutput* output,
+                           Parsed* new_parsed) {
+  // Scheme: this will append the colon. Its result is the only thing that
+  // decides whether this function reports success.
+  const bool scheme_ok = CanonicalizeScheme(
+      source.scheme, parsed.scheme, output, &new_parsed->scheme);
+
+  // We assume there's no authority for path URLs, so every authority
+  // component is reset.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+  // Path URLs may have path, query and fragment components; each of them is
+  // canonicalized via the weaker path URL rules, with '?' and '#' written
+  // ahead of the query and the fragment respectively.
+  // Note: parsing the path part should never cause a failure, see
+  // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
+  DoCanonicalizePathComponent<CHAR, UCHAR>(source.path, parsed.path, '\0',
+                                           output, &new_parsed->path);
+  DoCanonicalizePathComponent<CHAR, UCHAR>(source.query, parsed.query, '?',
+                                           output, &new_parsed->query);
+  DoCanonicalizePathComponent<CHAR, UCHAR>(source.ref, parsed.ref, '#',
+                                           output, &new_parsed->ref);
+  return scheme_ok;
+}
+
+} // namespace
+
+bool CanonicalizePathURL(const char* spec, int spec_len,
+                         const Parsed& parsed, CanonOutput* output,
+                         Parsed* new_parsed) {
+  // |spec_len| is not consulted here; |parsed| delimits the components.
+  const URLComponentSource<char> source(spec);
+  return DoCanonicalizePathURL<char, unsigned char>(source, parsed, output,
+                                                    new_parsed);
+}
+
+bool CanonicalizePathURL(const gurl_base::char16* spec, int spec_len,
+                         const Parsed& parsed, CanonOutput* output,
+                         Parsed* new_parsed) {
+  // |spec_len| is not consulted here; |parsed| delimits the components.
+  const URLComponentSource<gurl_base::char16> source(spec);
+  return DoCanonicalizePathURL<gurl_base::char16, gurl_base::char16>(
+      source, parsed, output, new_parsed);
+}
+
+bool ReplacePathURL(const char* base, const Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CanonOutput* output, Parsed* new_parsed) {
+  // Start from the base URL's components, hand them to
+  // SetupOverrideComponents() with |replacements|, then canonicalize.
+  Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupOverrideComponents(base, replacements, &merged_source, &merged_parsed);
+  return DoCanonicalizePathURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+bool ReplacePathURL(const char* base, const Parsed& base_parsed,
+                    const Replacements<gurl_base::char16>& replacements,
+                    CanonOutput* output, Parsed* new_parsed) {
+  // |utf8_scratch| is the buffer handed to SetupUTF16OverrideComponents().
+  RawCanonOutput<1024> utf8_scratch;
+  Parsed merged_parsed(base_parsed);
+  URLComponentSource<char> merged_source(base);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_scratch,
+                               &merged_source, &merged_parsed);
+  return DoCanonicalizePathURL<char, unsigned char>(
+      merged_source, merged_parsed, output, new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_query.cc b/url/url_canon_query.cc
new file mode 100644
index 0000000..99b8ed8
--- /dev/null
+++ b/url/url_canon_query.cc
@@ -0,0 +1,164 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+
+// Query canonicalization in IE
+// ----------------------------
+// IE is very permissive for query parameters specified in links on the page
+// (in contrast to links that it constructs itself based on form data). It does
+// not unescape any character. It does not reject any escape sequence (be they
+// invalid like "%2y" or freaky like %00).
+//
+// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
+// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
+// layer since they are removed from all portions of the URL). All other
+// characters are passed unmodified. Invalid UTF-16 sequences are preserved as
+// well, with each character in the input being converted to UTF-8. It is the
+// server's job to make sense of this invalid query.
+//
+// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
+// are converted to the invalid character and sent as unescaped UTF-8 (0xef,
+// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
+// strings before the URL handler ever sees them.
+//
+// Our query canonicalization
+// --------------------------
+// We escape all non-ASCII characters and control characters, like Firefox.
+// This is more conformant to the URL spec, and there do not seem to be many
+// problems relating to Firefox's behavior.
+//
+// Like IE, we will never unescape (although the application may want to try
+// unescaping to present the user with a more understandable URL). We will
+// replace all invalid sequences (including invalid UTF-16 sequences, which IE
+// doesn't) with the "invalid character," and we will escape it.
+
+namespace url {
+
+namespace {
+
+// Returns true if every character of |spec| inside the |query| component is
+// representable in 7 bits. An empty component counts as all-ASCII.
+template<typename CHAR, typename UCHAR>
+bool IsAllASCII(const CHAR* spec, const Component& query) {
+  for (int pos = query.begin, stop = query.end(); pos < stop; ++pos) {
+    const UCHAR ch = static_cast<UCHAR>(spec[pos]);
+    if (ch >= 0x80)
+      return false;
+  }
+  return true;
+}
+
+// Appends |length| characters of |source| to |output|, escaping those for
+// which IsQueryChar() is false. This version will accept 8 or 16 bit
+// characters, but assumes that they have only 7-bit values. It also assumes
+// that all UTF-8 values are correct, so doesn't bother checking.
+template<typename CHAR>
+void AppendRaw8BitQueryString(const CHAR* source, int length,
+                              CanonOutput* output) {
+  for (int pos = 0; pos < length; ++pos) {
+    const unsigned char ch = static_cast<unsigned char>(source[pos]);
+    if (IsQueryChar(ch))
+      output->push_back(static_cast<char>(source[pos]));  // Copied as is.
+    else
+      AppendEscapedChar(ch, output);
+  }
+}
+
+// Feeds the UTF-8 |query| range of |spec| to |converter|, which must be
+// non-NULL. The converter expects UTF-16, so the input is converted first.
+void RunConverter(const char* spec,
+                  const Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  // The UTF-8 to UTF-16 step replaces any misencoded values with the invalid
+  // character. That is the desired result, so no error check is needed.
+  RawCanonOutputW<1024> wide;
+  ConvertUTF8ToUTF16(spec + query.begin, query.len, &wide);
+  converter->ConvertFromUTF16(wide.data(), wide.length(), output);
+}
+
+// UTF-16 overload of RunConverter(): the input is handed to the converter as
+// is, so the same calling code works for both UTF-8 and UTF-16 input.
+void RunConverter(const gurl_base::char16* spec,
+                  const Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  const gurl_base::char16* const query_text = spec + query.begin;
+  converter->ConvertFromUTF16(query_text, query.len, output);
+}
+
+template<typename CHAR, typename UCHAR>
+void DoConvertToQueryEncoding(const CHAR* spec,
+                              const Component& query,
+                              CharsetConverter* converter,
+                              CanonOutput* output) {
+  const CHAR* const query_text = &spec[query.begin];
+  // Easy: all-ASCII input can be appended with no character set conversion.
+  if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
+    AppendRaw8BitQueryString(query_text, query.len, output);
+    return;
+  }
+
+  // Harder: non-ASCII input. With no converter, do our own UTF-8 conversion.
+  if (!converter) {
+    AppendStringOfType(query_text, query.len, CHAR_QUERY, output);
+    return;
+  }
+
+  // Otherwise run the converter to get an 8-bit string, then append it,
+  // escaping necessary values.
+  RawCanonOutput<1024> converted;
+  RunConverter(spec, query, converter, &converted);
+  AppendRaw8BitQueryString(converted.data(), converted.length(), output);
+}
+
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeQuery(const CHAR* spec,
+                         const Component& query,
+                         CharsetConverter* converter,
+                         CanonOutput* output,
+                         Component* out_query) {
+  const bool has_query = query.len >= 0;
+  if (has_query) {
+    // The '?' separator is written first and is not part of the component.
+    output->push_back('?');
+    out_query->begin = output->length();
+    DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
+    out_query->len = output->length() - out_query->begin;
+  } else {
+    // No query at all: report a default-constructed Component.
+    *out_query = Component();
+  }
+}
+
+} // namespace
+
+void CanonicalizeQuery(const char* spec, const Component& query,
+                       CharsetConverter* converter, CanonOutput* output,
+                       Component* out_query) {
+  // 8-bit entry point: forwards to the shared template implementation with
+  // unsigned char as the UCHAR template argument.
+  DoCanonicalizeQuery<char, unsigned char>(spec, query, converter, output,
+                                           out_query);
+}
+
+void CanonicalizeQuery(const gurl_base::char16* spec, const Component& query,
+                       CharsetConverter* converter, CanonOutput* output,
+                       Component* out_query) {
+  // 16-bit entry point: forwards to the shared template implementation with
+  // gurl_base::char16 as both template arguments.
+  DoCanonicalizeQuery<gurl_base::char16, gurl_base::char16>(
+      spec, query, converter, output, out_query);
+}
+
+void ConvertUTF16ToQueryEncoding(
+    const gurl_base::char16* input, const Component& query,
+    CharsetConverter* converter, CanonOutput* output) {
+  // UTF-16 entry point that exposes the shared query-encoding helper.
+  DoConvertToQueryEncoding<gurl_base::char16, gurl_base::char16>(
+      input, query, converter, output);
+}
+
+} // namespace url
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
new file mode 100644
index 0000000..47668f6
--- /dev/null
+++ b/url/url_canon_relative.cc
@@ -0,0 +1,589 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Canonicalizer functions for working with and resolving relative URLs.
+
+#include <algorithm>
+
+#include "polyfills/base/logging.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_constants.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+#include "url/url_util.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
+// 379034), whereas IE is case-insensitive.
+//
+// We choose to be more permissive like IE. We don't need to worry about
+// unescaping or anything here: neither IE nor Firefox allows this. We also
+// don't have to worry about invalid scheme characters since we are comparing
+// against the canonical scheme of the base.
+//
+// The base URL should always be canonical, therefore it should be ASCII.
+template<typename CHAR>
+bool AreSchemesEqual(const char* base,
+ const Component& base_scheme,
+ const CHAR* cmp,
+ const Component& cmp_scheme) {
+ if (base_scheme.len != cmp_scheme.len)
+ return false;
+ for (int i = 0; i < base_scheme.len; i++) {
+ // We assume the base is already canonical, so we don't have to
+ // canonicalize it.
+ if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) !=
+ base[base_scheme.begin + i])
+ return false;
+ }
+ return true;
+}
+
+#ifdef WIN32
+
+// Here, we also allow Windows paths to be represented as "/C:/" so we can be
+// consistent about URL paths beginning with slashes. This function is like
+// DoesBeginWindowsDrivePath except that it also requires a slash at the
+// beginning.
+template<typename CHAR>
+bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
+ int spec_len) {
+ if (start_offset >= spec_len)
+ return false;
+ return IsURLSlash(spec[start_offset]) &&
+ DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len);
+}
+
+#endif // WIN32
+
+// See IsRelativeURL in the header file for usage.
+template<typename CHAR>
+bool DoIsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const CHAR* url,
+ int url_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component) {
+ *is_relative = false; // So we can default later to not relative.
+
+ // Trim whitespace and construct a new range for the substring.
+ int begin = 0;
+ TrimURL(url, &begin, &url_len);
+ if (begin >= url_len) {
+ // Empty URLs are relative, but do nothing.
+ if (!is_base_hierarchical) {
+ // Don't allow relative URLs if the base scheme doesn't support it.
+ return false;
+ }
+ *relative_component = Component(begin, 0);
+ *is_relative = true;
+ return true;
+ }
+
+#ifdef WIN32
+ // We special case paths like "C:\foo" so they can link directly to the
+ // file on Windows (IE compatibility). The security domain stuff should
+ // prevent a link like this from actually being followed if it's on a
+ // web page.
+ //
+ // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
+ // as relative, as this will just replace the path when the base scheme
+ // is a file and the answer will still be correct.
+ //
+ // We require strict backslashes when detecting UNC since two forward
+ // slashes should be treated as a relative URL with a hostname.
+ if (DoesBeginWindowsDriveSpec(url, begin, url_len) ||
+ DoesBeginUNCPath(url, begin, url_len, true))
+ return true;  // *is_relative is still false here.
+#endif // WIN32
+
+ // See if we've got a scheme, if not, we know this is a relative URL.
+ // BUT, just because we have a scheme, doesn't make it absolute.
+ // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
+ // empty, we treat it as relative (":foo"), like IE does.
+ Component scheme;
+ const bool scheme_is_empty =
+ !ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
+ if (scheme_is_empty) {
+ if (url[begin] == '#') {
+ // |url| is a bare fragment (e.g. "#foo"). This can be resolved against
+ // any base. Fall-through.
+ } else if (!is_base_hierarchical) {
+ // Don't allow relative URLs if the base scheme doesn't support it.
+ return false;
+ }
+
+ *relative_component = MakeRange(begin, url_len);
+ *is_relative = true;
+ return true;
+ }
+
+ // If the scheme isn't valid, then it's relative.
+ int scheme_end = scheme.end();
+ for (int i = scheme.begin; i < scheme_end; i++) {
+ if (!CanonicalSchemeChar(url[i])) {
+ if (!is_base_hierarchical) {
+ // Don't allow relative URLs if the base scheme doesn't support it.
+ return false;
+ }
+ *relative_component = MakeRange(begin, url_len);
+ *is_relative = true;
+ return true;
+ }
+ }
+
+ // If the scheme is not the same, then we can't count it as relative.
+ if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
+ return true;  // *is_relative is still false here.
+
+ // When the scheme that they both share is not hierarchical, treat the
+ // incoming scheme as absolute (this way with the base of "data:foo",
+ // "data:bar" will be reported as absolute).
+ if (!is_base_hierarchical)
+ return true;
+
+ int colon_offset = scheme.end();
+
+ // If it's a filesystem URL, the only valid way to make it relative is not to
+ // supply a scheme. There's no equivalent to e.g. http:index.html.
+ if (CompareSchemeComponent(url, scheme, kFileSystemScheme))
+ return true;
+
+ // ExtractScheme guarantees that the colon immediately follows what it
+ // considers to be the scheme. CountConsecutiveSlashes will handle the
+ // case where the begin offset is the end of the input.
+ int num_slashes = CountConsecutiveSlashes(url, colon_offset + 1, url_len);
+
+ if (num_slashes == 0 || num_slashes == 1) {
+ // No slashes means it's a relative path like "http:foo.html". One slash
+ // is an absolute path. "http:/home/foo.html"
+ *is_relative = true;
+ *relative_component = MakeRange(colon_offset + 1, url_len);
+ return true;
+ }
+
+ // Two or more slashes after the scheme we treat as absolute.
+ return true;
+}
+
+// Copies all characters in the range [begin, end) of |spec| to the output,
+// up until and including the last slash. There should be a slash in the
+// range, if not, nothing will be copied.
+//
+// For standard URLs the input should be canonical, but when resolving relative
+// URLs on a non-standard base (like "data:") the input can be anything.
+void CopyToLastSlash(const char* spec,
+ int begin,
+ int end,
+ CanonOutput* output) {
+ // Find the last slash (forward or backward), scanning from the end.
+ int last_slash = -1;
+ for (int i = end - 1; i >= begin; i--) {
+ if (spec[i] == '/' || spec[i] == '\\') {
+ last_slash = i;
+ break;
+ }
+ }
+ if (last_slash < 0)
+ return; // No slash.
+
+ // Copy everything up to and including that slash.
+ for (int i = begin; i <= last_slash; i++)
+ output->push_back(spec[i]);
+}
+
+// Copies a single component from the source to the output. This is used
+// when resolving relative URLs and a given component is unchanged. Since the
+// source should already be canonical, we don't have to do anything special,
+// and the input is ASCII.
+void CopyOneComponent(const char* source,
+ const Component& source_component,
+ CanonOutput* output,
+ Component* output_component) {
+ if (source_component.len < 0) {
+ // This component is not present.
+ *output_component = Component();
+ return;
+ }
+
+ output_component->begin = output->length();
+ int source_end = source_component.end();
+ for (int i = source_component.begin; i < source_end; i++)
+ output->push_back(source[i]);
+ output_component->len = output->length() - output_component->begin;
+}
+
+#ifdef WIN32
+
+// Called on Windows when the base URL is a file URL, this will copy the "C:"
+// to the output, if there is a drive letter and if that drive letter is not
+// being overridden by the relative URL. Otherwise, do nothing.
+//
+// It will return the index of the beginning of the next character in the
+// base to be processed: if there is a "C:", the slash after it, or if
+// there is no drive letter, the slash at the beginning of the path, or
+// the end of the base. This can be used as the starting offset for further
+// path processing.
+template<typename CHAR>
+int CopyBaseDriveSpecIfNecessary(const char* base_url,
+ int base_path_begin,
+ int base_path_end,
+ const CHAR* relative_url,
+ int path_start,
+ int relative_url_len,
+ CanonOutput* output) {
+ if (base_path_begin >= base_path_end)
+ return base_path_begin; // No path.
+
+ // If the relative begins with a drive spec, don't do anything. The existing
+ // drive spec in the base will be replaced.
+ if (DoesBeginWindowsDriveSpec(relative_url, path_start, relative_url_len)) {
+ return base_path_begin; // Relative URL path is "C:/foo"
+ }
+
+ // The path should begin with a slash (as all canonical paths do). We check
+ // if it is followed by a drive letter and copy it.
+ if (DoesBeginSlashWindowsDriveSpec(base_url,
+ base_path_begin,
+ base_path_end)) {
+ // Copy the two-character drive spec to the output. It will now look like
+ // "file:///C:" so the rest of it can be treated like a standard path.
+ output->push_back('/');
+ output->push_back(base_url[base_path_begin + 1]);
+ output->push_back(base_url[base_path_begin + 2]);
+ return base_path_begin + 3;
+ }
+
+ return base_path_begin;
+}
+
+#endif // WIN32
+
+// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that
+// the input is a relative path or less (query or ref).
+template<typename CHAR>
+bool DoResolveRelativePath(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const CHAR* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ bool success = true;
+
+ // We know the authority section didn't change, copy it to the output. We
+ // also know we have a path so can copy up to there.
+ Component path, query, ref;
+ ParsePathInternal(relative_url, relative_component, &path, &query, &ref);
+
+ // Canonical URLs always have a path, so we can use that offset. Reserve
+ // enough room for the base URL, the new path, and some extra bytes for
+ // possible escaped characters.
+ output->ReserveSizeIfNeeded(
+ base_parsed.path.begin +
+ std::max(path.end(), std::max(query.end(), ref.end())));
+ output->Append(base_url, base_parsed.path.begin);
+
+ if (path.len > 0) {
+ // The path is replaced or modified.
+ int true_path_begin = output->length();
+
+ // For file: URLs on Windows, we don't want to treat the drive letter and
+ // colon as part of the path for relative file resolution when the
+ // incoming URL does not provide a drive spec. We save the true path
+ // beginning so we can fix it up after we are done.
+ int base_path_begin = base_parsed.path.begin;
+#ifdef WIN32
+ if (base_is_file) {
+ base_path_begin = CopyBaseDriveSpecIfNecessary(
+ base_url, base_parsed.path.begin, base_parsed.path.end(),
+ relative_url, relative_component.begin, relative_component.end(),
+ output);
+ // Now the output looks like either "file://" or "file:///C:"
+ // and we can start appending the rest of the path. |base_path_begin|
+ // points to the character in the base that comes next.
+ }
+#endif // WIN32
+
+ if (IsURLSlash(relative_url[path.begin])) {
+ // Easy case: the path is an absolute path on the server, so we can
+ // just replace everything from the path on with the new versions.
+ // Since the input should be canonical hierarchical URL, we should
+ // always have a path.
+ success &= CanonicalizePath(relative_url, path,
+ output, &out_parsed->path);
+ } else {
+ // Relative path, replace the query, and reference. We take the
+ // original path with the file part stripped, and append the new path.
+ // The canonicalizer will take care of resolving ".." and "."
+ int path_begin = output->length();
+ CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
+ output);
+ success &= CanonicalizePartialPath(relative_url, path, path_begin,
+ output);
+ out_parsed->path = MakeRange(path_begin, output->length());
+
+ // Copy the rest of the stuff after the path from the relative path.
+ }
+
+ // Finish with the query and reference part (these can't fail).
+ CanonicalizeQuery(relative_url, query, query_converter,
+ output, &out_parsed->query);
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+
+ // Fix the path beginning to add back the "C:" we may have written above.
+ out_parsed->path = MakeRange(true_path_begin, out_parsed->path.end());
+ return success;
+ }
+
+ // If we get here, the path is unchanged: copy to output.
+ CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
+
+ if (query.is_valid()) {
+ // Just the query specified, replace the query and reference (ignore
+ // failures for refs)
+ CanonicalizeQuery(relative_url, query, query_converter,
+ output, &out_parsed->query);
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+ return success;
+ }
+
+ // If we get here, the query is unchanged: copy to output. Note that the
+ // range of the query parameter doesn't include the question mark, so we
+ // have to add it manually if there is a component.
+ if (base_parsed.query.is_valid())
+ output->push_back('?');
+ CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
+
+ if (ref.is_valid()) {
+ // Just the reference specified: replace it (ignoring failures).
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+ return success;
+ }
+
+ // We should always have something to do in this function, the caller checks
+ // that some component is being replaced.
+ GURL_DCHECK(false) << "Not reached";
+ return success;
+}
+
+// Resolves a relative URL that contains a host. Typically, these will
+// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
+// should be kept from the original URL is the scheme.
+template<typename CHAR>
+bool DoResolveRelativeHost(const char* base_url,
+ const Parsed& base_parsed,
+ const CHAR* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ // Parse the relative URL, just like we would for anything following a
+ // scheme.
+ Parsed relative_parsed; // Everything but the scheme is valid.
+ ParseAfterScheme(relative_url, relative_component.end(),
+ relative_component.begin, &relative_parsed);
+
+ // Now we can just use the replacement function to replace all the necessary
+ // parts of the old URL with the new one.
+ Replacements<CHAR> replacements;
+ replacements.SetUsername(relative_url, relative_parsed.username);
+ replacements.SetPassword(relative_url, relative_parsed.password);
+ replacements.SetHost(relative_url, relative_parsed.host);
+ replacements.SetPort(relative_url, relative_parsed.port);
+ replacements.SetPath(relative_url, relative_parsed.path);
+ replacements.SetQuery(relative_url, relative_parsed.query);
+ replacements.SetRef(relative_url, relative_parsed.ref);
+
+ // Length() does not include the old scheme, so make sure to add it from the
+ // base URL.
+ output->ReserveSizeIfNeeded(
+ replacements.components().Length() +
+ base_parsed.CountCharactersBefore(Parsed::USERNAME, false));
+ SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ if (!GetStandardSchemeType(base_url, base_parsed.scheme, &scheme_type)) {
+ // A path with an authority section gets canonicalized under standard URL
+ // rules, even though the base was not known to be standard.
+ scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ }
+ return ReplaceStandardURL(base_url, base_parsed, replacements, scheme_type,
+ query_converter, output, out_parsed);
+}
+
+// Resolves a relative URL that happens to be an absolute file path. Examples
+// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
+template<typename CHAR>
+bool DoResolveAbsoluteFile(const CHAR* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ // Parse the file URL. The file URL parsing function uses the same logic
+ // as we do for determining if the file is absolute, in which case it will
+ // not bother to look for a scheme.
+ Parsed relative_parsed;
+ ParseFileURL(&relative_url[relative_component.begin], relative_component.len,
+ &relative_parsed);
+
+ return CanonicalizeFileURL(&relative_url[relative_component.begin],
+ relative_component.len, relative_parsed,
+ query_converter, output, out_parsed);
+}
+
+// TODO(brettw) treat two slashes as root like Mozilla for FTP?
+template<typename CHAR>
+bool DoResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const CHAR* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ // |base_parsed| is the starting point for our output. Since we may have
+ // removed whitespace from |relative_url| before entering this method, we'll
+ // carry over the |potentially_dangling_markup| flag.
+ bool potentially_dangling_markup = out_parsed->potentially_dangling_markup;
+ *out_parsed = base_parsed;
+ if (potentially_dangling_markup)
+ out_parsed->potentially_dangling_markup = true;
+
+ // Sanity check: the input should have a host or we'll break badly below.
+ // We can only resolve relative URLs with base URLs that have hosts and
+ // paths (even the default path of "/" is OK).
+ //
+ // We allow hosts with no length so we can handle file URLs, for example.
+ if (base_parsed.path.len <= 0) {
+ // On error, return the input (resolving a relative URL on a non-relative
+ // base = the base).
+ int base_len = base_parsed.Length();
+ for (int i = 0; i < base_len; i++)
+ output->push_back(base_url[i]);
+ return false;
+ }
+
+ if (relative_component.len <= 0) {
+ // Empty relative URL, leave unchanged, only removing the ref component.
+ int base_len = base_parsed.Length();
+ base_len -= base_parsed.ref.len + 1;
+ out_parsed->ref.reset();
+ output->Append(base_url, base_len);
+ return true;
+ }
+
+ int num_slashes = CountConsecutiveSlashes(
+ relative_url, relative_component.begin, relative_component.end());
+
+#ifdef WIN32
+ // On Windows, two slashes for a file path (regardless of which direction
+ // they are) means that it's UNC. Two backslashes on any base scheme mean
+ // that it's an absolute UNC path (we use the base_is_file flag to control
+ // how strict the UNC finder is).
+ //
+ // We also allow Windows absolute drive specs on any scheme (for example
+ // "c:\foo") like IE does. There must be no preceding slashes in this
+ // case (we reject anything like "/c:/foo") because that should be treated
+ // as a path. For file URLs, we allow any number of slashes since that would
+ // be setting the path.
+ //
+ // This assumes the absolute path resolver handles absolute URLs like this
+ // properly. DoCanonicalize does this.
+ int after_slashes = relative_component.begin + num_slashes;
+ if (DoesBeginUNCPath(relative_url, relative_component.begin,
+ relative_component.end(), !base_is_file) ||
+ ((num_slashes == 0 || base_is_file) &&
+ DoesBeginWindowsDriveSpec(
+ relative_url, after_slashes, relative_component.end()))) {
+ return DoResolveAbsoluteFile(relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+#else
+ // Other platforms need explicit handling for file: URLs with multiple
+ // slashes because the generic scheme parsing always extracts a host, but a
+ // file: URL only has a host if it has exactly 2 slashes. Even if it does
+ // have a host, we want to use the special host detection logic for file
+ // URLs provided by DoResolveAbsoluteFile(), as opposed to the generic host
+ // detection logic, for consistency with parsing file URLs from scratch.
+ // This also handles the special case where the URL is only slashes,
+ // since that doesn't have a host part either.
+ if (base_is_file &&
+ (num_slashes >= 2 || num_slashes == relative_component.len)) {
+ return DoResolveAbsoluteFile(relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+#endif
+
+ // Any other double-slashes mean that this is relative to the scheme.
+ if (num_slashes >= 2) {
+ return DoResolveRelativeHost(base_url, base_parsed,
+ relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+
+ // When we get here, we know that the relative URL is on the same host.
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file,
+ relative_url, relative_component,
+ query_converter, output, out_parsed);
+}
+
+} // namespace
+
+bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const char* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component) {
+ return DoIsRelativeURL<char>(
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
+ is_relative, relative_component);
+}
+
+bool IsRelativeURL(const char* base,
+ const Parsed& base_parsed,
+ const gurl_base::char16* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ Component* relative_component) {
+ return DoIsRelativeURL<gurl_base::char16>(
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
+ is_relative, relative_component);
+}
+
+bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const char* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ return DoResolveRelativeURL<char>(
+ base_url, base_parsed, base_is_file, relative_url,
+ relative_component, query_converter, output, out_parsed);
+}
+
+bool ResolveRelativeURL(const char* base_url,
+ const Parsed& base_parsed,
+ bool base_is_file,
+ const gurl_base::char16* relative_url,
+ const Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ return DoResolveRelativeURL<gurl_base::char16>(
+ base_url, base_parsed, base_is_file, relative_url,
+ relative_component, query_converter, output, out_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_stdstring.cc b/url/url_canon_stdstring.cc
new file mode 100644
index 0000000..c81a0a9
--- /dev/null
+++ b/url/url_canon_stdstring.cc
@@ -0,0 +1,31 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_canon_stdstring.h"
+
+namespace url {
+
+StdStringCanonOutput::StdStringCanonOutput(std::string* str)
+ : CanonOutput(), str_(str) {
+ cur_len_ = static_cast<int>(str_->size()); // Append to existing data.
+ buffer_ = str_->empty() ? NULL : &(*str_)[0];  // Never index into an empty string.
+ buffer_len_ = static_cast<int>(str_->size());  // Buffer length starts at the current string size.
+}
+
+StdStringCanonOutput::~StdStringCanonOutput() {
+ // Nothing to do, we don't own the string.
+}
+
+void StdStringCanonOutput::Complete() {
+ str_->resize(cur_len_);  // Trim the string down to |cur_len_| characters.
+ buffer_len_ = cur_len_;
+}
+
+void StdStringCanonOutput::Resize(int sz) {
+ str_->resize(sz);
+ buffer_ = str_->empty() ? NULL : &(*str_)[0];  // resize() may move the storage; refresh the pointer.
+ buffer_len_ = sz;
+}
+
+} // namespace url
diff --git a/url/url_canon_stdstring.h b/url/url_canon_stdstring.h
new file mode 100644
index 0000000..82ee9db
--- /dev/null
+++ b/url/url_canon_stdstring.h
@@ -0,0 +1,88 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CANON_STDSTRING_H_
+#define URL_URL_CANON_STDSTRING_H_
+
+// This header file defines a canonicalizer output method class for STL
+// strings. Because the canonicalizer tries not to be dependent on the STL,
+// we have segregated it here.
+
+#include <string>
+
+#include "base/compiler_specific.h"
+#include "polyfills/base/component_export.h"
+#include "base/macros.h"
+#include "base/strings/string_piece.h"
+#include "url/url_canon.h"
+
+namespace url {
+
+// Write into a std::string given in the constructor. This object does not own
+// the string itself, and the user must ensure that the string stays alive
+// throughout the lifetime of this object.
+//
+// The given string will be appended to; any existing data in the string will
+// be preserved.
+//
+// Note that when canonicalization is complete, the string will likely have
+// unused space at the end because Resize() grows the string to the requested
+// buffer size rather than to the amount of data written. This ends up being
+// important because resize operations are slow, and because the base class
+// needs to write directly into the buffer.
+//
+// Therefore, the user should call Complete() before using the string that
+// this class wrote into.
+class COMPONENT_EXPORT(URL) StdStringCanonOutput : public CanonOutput {
+ public:
+ explicit StdStringCanonOutput(std::string* str);  // |str| is appended to and must outlive this object.
+ ~StdStringCanonOutput() override;
+
+ // Must be called after writing has completed but before the string is used.
+ void Complete();
+
+ void Resize(int sz) override;  // Resizes the target string to |sz| and re-points the buffer at it.
+
+ protected:
+ std::string* str_;  // Not owned.
+ DISALLOW_COPY_AND_ASSIGN(StdStringCanonOutput);
+};
+
+// An extension of the Replacements class that allows the setters to use
+// StringPieces (implicitly allowing strings or char*s).
+//
+// The contents of the StringPieces are not copied and must remain valid until
+// the StringPieceReplacements object goes out of scope.
+template<typename STR>
+class StringPieceReplacements : public Replacements<typename STR::value_type> {
+ public:
+ void SetSchemeStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetScheme(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetUsernameStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetUsername(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetPasswordStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetPassword(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetHostStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetHost(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetPortStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetPort(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetPathStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetPath(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetQueryStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetQuery(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+ void SetRefStr(const gurl_base::BasicStringPiece<STR>& s) {
+ this->SetRef(s.data(), Component(0, static_cast<int>(s.length())));
+ }
+};
+
+} // namespace url
+
+#endif // URL_URL_CANON_STDSTRING_H_
diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc
new file mode 100644
index 0000000..78f7773
--- /dev/null
+++ b/url/url_canon_stdurl.cc
@@ -0,0 +1,207 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Functions to canonicalize "standard" URLs, which are ones that have an
+// authority section including a host name.
+
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_constants.h"
+
+namespace url {
+
+namespace {
+
+template <typename CHAR, typename UCHAR>
+bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
+ const Parsed& parsed,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ // Scheme: this will append the colon.
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
+ output, &new_parsed->scheme);
+
+ bool scheme_supports_user_info =
+ (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION);
+ bool scheme_supports_ports =
+ (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
+ scheme_type == SCHEME_WITH_HOST_AND_PORT);
+
+ // Authority (username, password, host, port)
+ bool have_authority;
+ if ((scheme_supports_user_info &&
+ (parsed.username.is_valid() || parsed.password.is_valid())) ||
+ parsed.host.is_nonempty() ||
+ (scheme_supports_ports && parsed.port.is_valid())) {
+ have_authority = true;
+
+ // Only write the authority separators when we have a scheme.
+ if (parsed.scheme.is_valid()) {
+ output->push_back('/');
+ output->push_back('/');
+ }
+
+ // User info: the canonicalizer will handle the : and @.
+ if (scheme_supports_user_info) {
+ success &= CanonicalizeUserInfo(
+ source.username, parsed.username, source.password, parsed.password,
+ output, &new_parsed->username, &new_parsed->password);
+ } else {
+ new_parsed->username.reset();
+ new_parsed->password.reset();
+ }
+
+ success &= CanonicalizeHost(source.host, parsed.host,
+ output, &new_parsed->host);
+
+ // Host must not be empty for standard URLs.
+ if (!parsed.host.is_nonempty())
+ success = false;
+
+ // Port: the port canonicalizer will handle the colon.
+ if (scheme_supports_ports) {
+ int default_port = DefaultPortForScheme(
+ &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
+ success &= CanonicalizePort(source.port, parsed.port, default_port,
+ output, &new_parsed->port);
+ } else {
+ new_parsed->port.reset();
+ }
+ } else {
+ // No authority, clear the components.
+ have_authority = false;
+ new_parsed->host.reset();
+ new_parsed->username.reset();
+ new_parsed->password.reset();
+ new_parsed->port.reset();
+ success = false; // Standard URLs must have an authority.
+ }
+
+ // Path
+ if (parsed.path.is_valid()) {
+ success &= CanonicalizePath(source.path, parsed.path,
+ output, &new_parsed->path);
+ } else if (have_authority ||
+ parsed.query.is_valid() || parsed.ref.is_valid()) {
+ // When we have an empty path, make up a path when we have an authority
+ // or something following the path. The only time we allow an empty
+ // output path is when there is nothing else.
+ new_parsed->path = Component(output->length(), 1);
+ output->push_back('/');
+ } else {
+ // No path at all
+ new_parsed->path.reset();
+ }
+
+ // Query
+ CanonicalizeQuery(source.query, parsed.query, query_converter,
+ output, &new_parsed->query);
+
+ // Ref: ignore failure for this, since the page can probably still be loaded.
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+ return success;
+}
+
+} // namespace
+
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+int DefaultPortForScheme(const char* scheme, int scheme_len) {
+ int default_port = PORT_UNSPECIFIED;
+ switch (scheme_len) {
+ case 4:
+ if (!strncmp(scheme, kHttpScheme, scheme_len))
+ default_port = 80;
+ break;
+ case 5:
+ if (!strncmp(scheme, kHttpsScheme, scheme_len))
+ default_port = 443;
+ break;
+ case 3:
+ if (!strncmp(scheme, kFtpScheme, scheme_len))
+ default_port = 21;
+ else if (!strncmp(scheme, kWssScheme, scheme_len))
+ default_port = 443;
+ break;
+ case 6:
+ if (!strncmp(scheme, kGopherScheme, scheme_len))
+ default_port = 70;
+ break;
+ case 2:
+ if (!strncmp(scheme, kWsScheme, scheme_len))
+ default_port = 80;
+ break;
+ }
+ return default_port;
+}
+
+bool CanonicalizeStandardURL(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeStandardURL<char, unsigned char>(
+ URLComponentSource<char>(spec), parsed, scheme_type, query_converter,
+ output, new_parsed);
+}
+
+bool CanonicalizeStandardURL(const gurl_base::char16* spec,
+ int spec_len,
+ const Parsed& parsed,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ return DoCanonicalizeStandardURL<gurl_base::char16, gurl_base::char16>(
+ URLComponentSource<gurl_base::char16>(spec), parsed, scheme_type,
+ query_converter, output, new_parsed);
+}
+
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
+bool ReplaceStandardURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupOverrideComponents(base, replacements, &source, &parsed);
+ return DoCanonicalizeStandardURL<char, unsigned char>(
+ source, parsed, scheme_type, query_converter, output, new_parsed);
+}
+
+// For 16-bit replacements, we turn all the replacements into UTF-8 so the
+// regular code path can be used.
+bool ReplaceStandardURL(const char* base,
+ const Parsed& base_parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ SchemeType scheme_type,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;
+ URLComponentSource<char> source(base);
+ Parsed parsed(base_parsed);
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeStandardURL<char, unsigned char>(
+ source, parsed, scheme_type, query_converter, output, new_parsed);
+}
+
+} // namespace url
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
new file mode 100644
index 0000000..9d1a458
--- /dev/null
+++ b/url/url_canon_unittest.cc
@@ -0,0 +1,2396 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <errno.h>
+#include <stddef.h>
+
+#include "base/stl_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/test/gtest_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_internal.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_test_utils.h"
+
+namespace url {
+
+namespace {
+
+struct ComponentCase {  // One canonicalization test vector with a narrow input.
+ const char* input;  // Text handed to the canonicalizer under test.
+ const char* expected;  // Expected canonical output string.
+ Component expected_component;  // Expected begin/len of the output component.
+ bool expected_success;  // Expected return value of the canonicalizer.
+};
+
+// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
+// treat each input as optional, and will only try processing if non-NULL.
+// The output is always 8-bit.
+struct DualComponentCase {
+ const char* input8;  // Narrow input; NULL when there is no 8-bit variant.
+ const wchar_t* input16;  // Wide input; NULL when there is no wide variant.
+ const char* expected;  // Expected canonical output (always 8-bit).
+ Component expected_component;  // Expected output range.
+ bool expected_success;  // Expected success flag.
+};
+
+// Test cases for CanonicalizeIPAddress(). The inputs are identical to
+// DualComponentCase, but the output has extra CanonHostInfo fields.
+struct IPAddressCase {
+ const char* input8;  // Narrow input; the Host test skips it when NULL.
+ const wchar_t* input16;  // Wide input; the Host test skips it when NULL.
+ const char* expected;  // Expected canonical output text.
+ Component expected_component;  // Expected range of the host in the output.
+
+ // CanonHostInfo fields, for verbose output.
+ CanonHostInfo::Family expected_family;  // Compared with host_info.family.
+ int expected_num_ipv4_components;  // Only checked for IPV4 results.
+ const char* expected_address_hex; // Two hex chars per IP address byte.
+};
+
+std::string BytesToHexString(unsigned char bytes[16], int length) {
+  EXPECT_TRUE(length == 0 || length == 4 || length == 16)
+      << "Bad IP address length: " << length;
+  std::string hex;  // Two digits per byte, high nibble first.
+  for (int pos = 0; pos < length; ++pos) {
+    hex += kHexCharLookup[(bytes[pos] >> 4) & 0xf];
+    hex += kHexCharLookup[bytes[pos] & 0xf];
+  }
+  return hex;
+}
+
+struct ReplaceCase {  // NOTE(review): users are outside this chunk.
+ const char* base;  // presumably the URL the replacements apply to - confirm
+ const char* scheme;  // scheme..ref: presumably one replacement string per
+ const char* username;  // URL component, each fed through SetupReplComp
+ const char* password;  // (null = untouched, kDeleteComp = clear) - confirm
+ const char* host;
+ const char* port;
+ const char* path;
+ const char* query;
+ const char* ref;
+ const char* expected;  // presumably the expected resulting URL - confirm
+};
+
+// Magic replacement string: SetupReplComp calls the component's clear
+// function when a replacement's first character matches this marker.
+const char kDeleteComp[] = "|";
+
+// Applies one test-specified replacement to |rep|. |set| and |clear| are the
+// Replacements member functions for the component being replaced. A null
+// |str| leaves |rep| untouched, a |str| whose first character is the
+// kDeleteComp marker invokes |clear|, and any other |str| is passed to |set|
+// with a component spanning its whole length.
+//
+// Only the 8-bit instantiation is used; strlen() rules out wider CHAR types.
+template<typename CHAR>
+void SetupReplComp(
+    void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
+    void (Replacements<CHAR>::*clear)(),
+    Replacements<CHAR>* rep,
+    const CHAR* str) {
+  if (!str)
+    return;
+  if (str[0] == kDeleteComp[0])
+    (rep->*clear)();
+  else
+    (rep->*set)(str, Component(0, static_cast<int>(strlen(str))));
+}
+
+} // namespace
+
+TEST(URLCanonTest, DoAppendUTF8) {
+  struct UTF8Case {
+    unsigned input;
+    const char* output;
+  } utf_cases[] = {
+    // Valid code points.
+    {0x24, "\x24"},
+    {0xA2, "\xC2\xA2"},
+    {0x20AC, "\xE2\x82\xAC"},
+    {0x24B62, "\xF0\xA4\xAD\xA2"},
+    {0x10FFFF, "\xF4\x8F\xBF\xBF"},
+  };
+  // Each code point must encode to exactly the listed UTF-8 byte sequence.
+  for (const UTF8Case& test : utf_cases) {
+    std::string encoded;
+    StdStringCanonOutput sink(&encoded);
+    AppendUTF8Value(test.input, &sink);
+    sink.Complete();
+    EXPECT_EQ(test.output, encoded);
+  }
+}
+
+TEST(URLCanonTest, DoAppendUTF8Invalid) {
+  std::string encoded;
+  StdStringCanonOutput sink(&encoded);
+  // 0x110000 is too large to be a code point, so appending it must DCHECK.
+  EXPECT_DCHECK_DEATH({
+    AppendUTF8Value(0x110000, &sink);
+    sink.Complete();
+  });
+}
+
+TEST(URLCanonTest, UTF) {
+  // Low-level test that we handle reading, canonicalization, and writing
+  // UTF-8/UTF-16 strings properly.
+  struct UTFCase {
+    const char* input8;  // UTF-8 input, or nullptr to skip the narrow pass.
+    const wchar_t* input16;  // UTF-16 input, or nullptr to skip the wide pass.
+    bool expected_success;  // Expected AND of all per-character results.
+    const char* output;  // Expected escaped output for both passes.
+  } utf_cases[] = {
+    // Valid canonical input should get passed through & escaped.
+    {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
+    // Test a character that takes > 16 bits (U+10300 = old italic letter A)
+    {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
+    // Non-shortest-form UTF-8 characters are invalid. The bad bytes should
+    // each be replaced with the invalid character (EF BF BD in UTF-8).
+    {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
+     "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
+    // Invalid UTF-8 sequences should be marked as invalid (the first
+    // sequence is truncated).
+    {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
+    // Character going off the end.
+    {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
+    // ...same with low surrogates with no high surrogate.
+    {nullptr, L"\xdc00", false, "%EF%BF%BD"},
+    // Test a UTF-8 encoded surrogate value is marked as invalid.
+    // ED A0 80 = U+D800
+    {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
+    // ...even when paired.
+    {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
+     "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
+  };
+
+  std::string out_str;
+  for (size_t i = 0; i < gurl_base::size(utf_cases); i++) {
+    if (utf_cases[i].input8) {
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      int input_len = static_cast<int>(strlen(utf_cases[i].input8));
+      bool success = true;
+      for (int ch = 0; ch < input_len; ch++) {  // &ch is handed to the callee.
+        success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len,
+                                         &output);
+      }
+      output.Complete();
+      EXPECT_EQ(utf_cases[i].expected_success, success);
+      EXPECT_EQ(std::string(utf_cases[i].output), out_str);
+    }
+    if (utf_cases[i].input16) {
+      out_str.clear();
+      StdStringCanonOutput output(&out_str);
+
+      gurl_base::string16 input_str(
+          test_utils::TruncateWStringToUTF16(utf_cases[i].input16));
+      int input_len = static_cast<int>(input_str.length());
+      bool success = true;
+      for (int ch = 0; ch < input_len; ch++) {  // &ch is handed to the callee.
+        success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
+                                         &output);
+      }
+      output.Complete();
+      EXPECT_EQ(utf_cases[i].expected_success, success);
+      EXPECT_EQ(std::string(utf_cases[i].output), out_str);
+    }
+
+    if (utf_cases[i].input8 && utf_cases[i].input16 &&
+        utf_cases[i].expected_success) {
+      // Check that the UTF-8 and UTF-16 inputs are equivalent.
+
+      // UTF-16 -> UTF-8
+      std::string input8_str(utf_cases[i].input8);
+      gurl_base::string16 input16_str(
+          test_utils::TruncateWStringToUTF16(utf_cases[i].input16));
+      EXPECT_EQ(input8_str, gurl_base::UTF16ToUTF8(input16_str));
+
+      // UTF-8 -> UTF-16
+      EXPECT_EQ(input16_str, gurl_base::UTF8ToUTF16(input8_str));
+    }
+  }
+}
+
+TEST(URLCanonTest, Scheme) {
+  // The focus here is on how unusual characters are treated. The scheme
+  // canonicalizer does no parsing or whitespace detection of its own. On bad
+  // input it does its best and escapes odd sequences (the result is not a
+  // valid scheme and an error is returned).
+  //
+  // The canonicalizer appends a colon, which is absent from the input, to
+  // separate the scheme from the rest of the URL. We check that the reported
+  // output range covers everything except that colon.
+  ComponentCase scheme_cases[] = {
+    {"http", "http:", Component(0, 4), true},
+    {"HTTP", "http:", Component(0, 4), true},
+    {" HTTP ", "%20http%20:", Component(0, 10), false},
+    {"htt: ", "htt%3A%20:", Component(0, 9), false},
+    {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false},
+    // Something already escaped is not escaped again. The 'A' does get
+    // "canonicalized" to 'a', which is acceptable.
+    {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
+    {"", ":", Component(0, 0), false},
+  };
+
+  std::string canon_text;
+
+  for (const ComponentCase& test : scheme_cases) {
+    const int input_len = static_cast<int>(strlen(test.input));
+    Component in_range(0, input_len);
+    Component out_range;
+
+    canon_text.clear();
+    StdStringCanonOutput narrow_output(&canon_text);
+    bool ok = CanonicalizeScheme(test.input, in_range, &narrow_output,
+                                 &out_range);
+    narrow_output.Complete();
+
+    EXPECT_EQ(test.expected_success, ok);
+    EXPECT_EQ(std::string(test.expected), canon_text);
+    EXPECT_EQ(test.expected_component.begin, out_range.begin);
+    EXPECT_EQ(test.expected_component.len, out_range.len);
+
+    // Repeat with the same input converted to UTF-16.
+    canon_text.clear();
+    StdStringCanonOutput wide_output(&canon_text);
+
+    gurl_base::string16 input16(gurl_base::UTF8ToUTF16(test.input));
+    in_range.len = static_cast<int>(input16.length());
+    ok = CanonicalizeScheme(input16.c_str(), in_range, &wide_output,
+                            &out_range);
+    wide_output.Complete();
+
+    EXPECT_EQ(test.expected_success, ok);
+    EXPECT_EQ(std::string(test.expected), canon_text);
+    EXPECT_EQ(test.expected_component.begin, out_range.begin);
+    EXPECT_EQ(test.expected_component.len, out_range.len);
+  }
+
+  // A scheme component declared nonexistent (length -1) should be turned
+  // into an empty scheme.
+  Component empty_range;
+  canon_text.clear();
+  StdStringCanonOutput empty_output(&canon_text);
+
+  EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &empty_output, &empty_range));
+  empty_output.Complete();
+
+  EXPECT_EQ(std::string(":"), canon_text);
+  EXPECT_EQ(0, empty_range.begin);
+  EXPECT_EQ(0, empty_range.len);
+}
+
+TEST(URLCanonTest, Host) {
+ IPAddressCase host_cases[] = {
+ // Basic canonicalization, uppercase should be converted to lowercase.
+ {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Spaces and some other characters should be escaped.
+ {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
+ // Exciting different types of spaces!
+ {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
+ // Other types of space (no-break, zero-width, zero-width-no-break) are
+ // name-prepped away to nothing.
+ {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Ideographic full stop (full-width period for Chinese, etc.) should be
+ // treated as a dot.
+ {NULL, L"www.foo\x3002" L"bar.com", "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ // Invalid unicode characters should fail...
+ // ...In wide input, ICU will barf and we'll end up with the input as
+ // escaped UTF-8 (the invalid character should be replaced with the
+ // replacement character).
+ {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ // ...This is the same as the previous case but with the input escaped.
+ {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ // Test name prepping, fullwidth input should be converted to ASCII and NOT
+ // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
+ {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
+ // Test that fullwidth escaped values are properly name-prepped,
+ // then converted or rejected.
+ // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
+ {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
+ {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ // ICU will convert weird percents into ASCII percents, but not unescape
+ // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
+ // "small percent". At this point we should be within our rights to mark
+ // anything as invalid since the URL is corrupt or malicious. The code
+ // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
+ // and kept as valid, so we validate that behavior here, but this level
+ // of fixing the input shouldn't be seen as required. "%81" is invalid.
+ {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
+ {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // See http://unicode.org/cldr/utility/idna.jsp for other
+ // examples/experiments and http://goo.gl/7yG11o
+ // for the full list of characters handled differently by
+ // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.
+
+ // 4 Deviation characters are mapped/ignored in UTS 46 transitional
+ // mechanism. UTS 46, table 4 row (g).
+ // Sharp-s is mapped to 'ss' in UTS 46 and IDNA 2003.
+ // Otherwise, it'd be "xn--fuball-cta.de".
+ {"fu\xc3\x9f" "ball.de", L"fu\x00df" L"ball.de", "fussball.de",
+ Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
+ // Final-sigma (U+03C2) is mapped to regular sigma (U+03C3).
+ // Otherwise, it'd be "xn--wxaijb9b".
+ {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
+ "xn--wxaikc6b", Component(0, 12),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
+ // handling as well as in IDNA 2003.
+ {"a\xe2\x80\x8c" "b\xe2\x80\x8d" "c", L"a\x200c" L"b\x200d" L"c", "abc",
+ Component(0, 3), CanonHostInfo::NEUTRAL, -1, ""},
+ // ZWJ between Devanagari characters is still mapped away in UTS 46
+ // transitional handling. IDNA 2008 would give xn--11bo0mv54g.
+ {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
+ L"\x915\x94d\x200d\x91c", "xn--11bo0m",
+ Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
+ // However, we do allow this at the moment because we don't use
+ // STD3 rules and canonicalize full-width ASCII to ASCII.
+ {"wow\xef\xbc\x81", L"wow\xff01", "wow%21",
+ Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
+ // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
+ // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
+ {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo",
+ Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
+ // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
+ // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
+ {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
+ "%F0%AF%A1%A8%E5%A7%BB.cn",
+ Component(0, 24), CanonHostInfo::BROKEN, -1, ""},
+ // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
+ {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya",
+ Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // An already-IDNA host is not modified.
+ {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya",
+ Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // Symbol/punctuations are allowed in IDNA 2003/UTS46.
+ // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
+ {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us",
+ Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
+ // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
+ // We used to allow it because we passed through unassigned code points.
+ {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
+ Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
+ // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
+ // Used to be allowed in IDNA 2003.
+ {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg",
+ Component(0, 9), CanonHostInfo::BROKEN, -1, ""},
+ // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
+ // on Unicode 3.2). We did allow it in the past because we let unassigned
+ // code point pass. We continue to allow it even though it's a
+ // "punctuation and symbol" blocked in IDNA 2008.
+ // UTS 46 table 4, row (j)
+ {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com",
+ Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
+ // Maps uppercase letters to lower case letters.
+ // In IDNA 2003, it's allowed without case-folding
+ // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
+ // (added in Unicode 4.1). UTS 46 table 4 row (k)
+ {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com",
+ Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ // Maps U+FF43 (Full Width Small Letter C) to 'c'.
+ {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz",
+ Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
+ // Maps U+1D68C (Math Monospace Small C) to 'c'.
+ // U+1D68C = \xD835\xDE8C in UTF-16
+ {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
+ Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
+ // BiDi check test
+ // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
+ // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
+ {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
+ L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw",
+ Component(0, 13), CanonHostInfo::NEUTRAL, -1, ""},
+ // Disallowed in both IDNA 2003 and 2008 with BiDi check.
+ // Labels starting with a RTL character cannot end with a LTR character.
+ {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
+ "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21),
+ CanonHostInfo::BROKEN, -1, ""},
+ // Labels starting with a RTL character can end with BC=EN (European
+ // number). Disallowed in IDNA 2003 but now allowed.
+ {"\xd8\xac\xd8\xa7\xd8\xb1" "2", L"\x62c\x627\x631" L"2",
+ "xn--2-ymcov", Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ // Labels starting with a RTL character cannot have "L" characters
+ // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
+ "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21),
+ CanonHostInfo::BROKEN, -1, ""},
+ // Labels starting with a RTL character can end with BC=AN (Arabic number)
+ // Disallowed in IDNA 2003, but now allowed.
+ {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
+ "xn--mgbjq0r", Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ // Labels starting with a RTL character cannot have "L" characters
+ // even if it ends with an BC=AN (Arabic number).
+ // Disallowed in both IDNA 2003/2008.
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
+ "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26),
+ CanonHostInfo::BROKEN, -1, ""},
+ // Labels starting with a RTL character cannot mix BC=EN and BC=AN
+ {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
+ "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27),
+ CanonHostInfo::BROKEN, -1, ""},
+ // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
+ {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com",
+ Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
+ // U+0080 is not allowed.
+ {"\xc2\x80.com", L"\x80.com", "%C2%80.com",
+ Component(0, 10), CanonHostInfo::BROKEN, -1, ""},
+ // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
+ // UTF-8 (wide case). The output should be equivalent to the true wide
+ // character input above (the "Basic IDN support" case, which expects the
+ // same "xn--6qqa088eba").
+ {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
+ L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba",
+ Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // Invalid escaped characters should fail and the percents should be
+ // escaped.
+ {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
+ CanonHostInfo::BROKEN, -1, ""},
+ // If we get an invalid character that has been escaped.
+ {"%25", L"%25", "%25", Component(0, 3),
+ CanonHostInfo::BROKEN, -1, ""},
+ {"hello%00", L"hello%00", "hello%00", Component(0, 8),
+ CanonHostInfo::BROKEN, -1, ""},
+ // Escaped numbers should be treated like IP addresses if they are.
+ {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
+ "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
+ "C0A80001"},
+ {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e",
+ "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3,
+ "C0A80001"},
+ // Invalid escaping should trigger the regular host error handling.
+ {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
+ // Something that isn't exactly an IP should get treated as a host and
+ // spaces escaped.
+ {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
+ // These are "0Xc0.0250.01" in fullwidth.
+ {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ // Broken IP addresses get marked as such.
+ {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
+ {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12), CanonHostInfo::BROKEN, -1, ""},
+ // Cyrillic letter followed by '(' should return punycode for '(' escaped
+ // before punycode string was created. I.e.
+ // if '(' is escaped after punycode is created we would get xn--%28-8tb
+ // (incorrect).
+ {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
+ CanonHostInfo::NEUTRAL, -1, ""},
+ // Address with all hexadecimal characters and a leading number of 1<<32
+ // or greater should return NEUTRAL rather than BROKEN if not all
+ // components are numbers.
+ {"12345678912345.de", L"12345678912345.de", "12345678912345.de", Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
+ // A label that starts with "xn--" but contains non-ASCII characters should
+ // be an error. Escape the invalid characters.
+ {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen", Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ };
+
+ // CanonicalizeHost() non-verbose: success is expected unless BROKEN.
+ std::string out_str;
+ for (size_t i = 0; i < gurl_base::size(host_cases); i++) {
+ // Narrow version.
+ if (host_cases[i].input8) {
+ int host_len = static_cast<int>(strlen(host_cases[i].input8));
+ Component in_comp(0, host_len);
+ Component out_comp;
+
+ out_str.clear();
+ StdStringCanonOutput output(&out_str);
+
+ bool success = CanonicalizeHost(host_cases[i].input8, in_comp, &output,
+ &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+ success) << "for input: " << host_cases[i].input8;
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str) <<
+ "for input: " << host_cases[i].input8;
+ EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin) <<
+ "for input: " << host_cases[i].input8;
+ EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len) <<
+ "for input: " << host_cases[i].input8;
+ }
+
+ // Wide version.
+ if (host_cases[i].input16) {
+ gurl_base::string16 input16(
+ test_utils::TruncateWStringToUTF16(host_cases[i].input16));
+ int host_len = static_cast<int>(input16.length());
+ Component in_comp(0, host_len);
+ Component out_comp;
+
+ out_str.clear();
+ StdStringCanonOutput output(&out_str);
+
+ bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
+ &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+ success);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
+ }
+ }
+
+ // CanonicalizeHostVerbose(): also checks family, address bytes, IPv4 parts.
+ for (size_t i = 0; i < gurl_base::size(host_cases); i++) {
+ // Narrow version.
+ if (host_cases[i].input8) {
+ int host_len = static_cast<int>(strlen(host_cases[i].input8));
+ Component in_comp(0, host_len);
+
+ out_str.clear();
+ StdStringCanonOutput output(&out_str);
+ CanonHostInfo host_info;
+
+ CanonicalizeHostVerbose(host_cases[i].input8, in_comp, &output,
+ &host_info);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin,
+ host_info.out_host.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+ EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+ }
+
+ // Wide version.
+ if (host_cases[i].input16) {
+ gurl_base::string16 input16(
+ test_utils::TruncateWStringToUTF16(host_cases[i].input16));
+ int host_len = static_cast<int>(input16.length());
+ Component in_comp(0, host_len);
+
+ out_str.clear();
+ StdStringCanonOutput output(&out_str);
+ CanonHostInfo host_info;
+
+ CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin,
+ host_info.out_host.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+ EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+ }
+ }
+}
+
+TEST(URLCanonTest, IPv4) {
+ IPAddressCase cases[] = {
+ // Empty is not an IP address.
+ {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Regular IP addresses in different bases.
+ {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+ {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+ {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+ // Non-IP addresses due to invalid characters.
+ {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Invalid characters for the base should be rejected.
+ {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // If there are not enough components, the last one should fill them out.
+ {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
+ {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+ {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+ {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+ {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
+ {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
+ {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
+ {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ // Too many components means not an IP address.
+ {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // We allow a single trailing dot.
+ {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+ {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Two dots in a row means not an IP address.
+ {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Any numerical overflow should be marked as BROKEN.
+ {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Repeat the previous tests, minus 1, to verify boundaries.
+ {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
+ {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
+ {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
+ {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
+ {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
+ {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
+ {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
+ {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
+ {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
+ // Old truncation tests. They're all "BROKEN" now.
+ {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Spaces should be rejected.
+ {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Very large numbers.
+ {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
+ {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
+ // A number has no length limit, but long numbers can still overflow.
+ {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
+ {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // If a long component is non-numeric, it's a hostname, *not* a broken IP.
+ {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Truncation of all zeros should still result in 0.
+ {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(cases); i++) {
+ // 8-bit version.
+ Component component(0, static_cast<int>(strlen(cases[i].input8)));
+
+ std::string out_str1;
+ StdStringCanonOutput output1(&out_str1);
+ CanonHostInfo host_info;
+ CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info);
+ output1.Complete();
+
+ EXPECT_EQ(cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_info.family == CanonHostInfo::IPV4) {
+ EXPECT_STREQ(cases[i].expected, out_str1.c_str());
+ EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+ EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+
+ // 16-bit version.
+ gurl_base::string16 input16(
+ test_utils::TruncateWStringToUTF16(cases[i].input16));
+ component = Component(0, static_cast<int>(input16.length()));
+
+ std::string out_str2;
+ StdStringCanonOutput output2(&out_str2);
+ CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
+ output2.Complete();
+
+ EXPECT_EQ(cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_info.family == CanonHostInfo::IPV4) {
+ EXPECT_STREQ(cases[i].expected, out_str2.c_str());
+ EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+ EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+ }
+}
+
+TEST(URLCanonTest, IPv6) {  // IPv6-style input to CanonicalizeIPAddress.
+ IPAddressCase cases[] = {
+ // Empty is not an IP address.
+ {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
+ // Non-IPs with [:] characters are marked BROKEN.
+ {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Regular IP address is invalid without bounding '[' and ']'.
+ {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Regular IP addresses.
+ {"[::]", L"[::]", "[::]", Component(0,4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"},
+ {"[::1]", L"[::1]", "[::1]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"},
+ {"[1::]", L"[1::]", "[1::]", Component(0,5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"},
+
+ // Leading zeros should be stripped.
+ {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0,17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"},
+
+ // Upper case letters should be lowercased.
+ {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0,20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"},
+
+ // The same address can be written with different contractions, but should
+ // get canonicalized to the same thing.
+ {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
+ {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0,14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
+
+ // Addresses with embedded IPv4.
+ {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0,10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
+ {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
+ {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"},
+ {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"},
+ {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // IPv4 with last component missing.
+ {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0010002"},
+
+ // IPv4 using hex.
+ // TODO(eroman): Should this format be disallowed?
+ {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0,15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
+
+ // There may be zeros surrounding the "::" contraction.
+ {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0,5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},
+
+ {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
+
+ // Can only have one "::" contraction in an IPv6 string literal.
+ {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // No more than 2 consecutive ':'s.
+ {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Non-IP addresses due to invalid characters.
+ {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // If there are not enough components, the last one should fill them out.
+ // ... omitted at this time ...
+ // Too many components means not an IP address. Similarly, with too few
+ // if using IPv4 compat or mapped addresses.
+ {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Too many bits (even though 8 components, the last one holds 32 bits).
+ {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // Too many bits specified -- the contraction would have to be zero-length
+ // to not exceed 128 bits.
+ {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // The contraction is for 16 bits of zero.
+ {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0,17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"},
+
+ // Cannot have a trailing colon.
+ {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // Cannot have negative numbers.
+ {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // Scope ID -- an IPv6 literal may be written with a ["%" <scope_id>] suffix.
+ // The cases below expect such input to be rejected: every input that
+ // contains a '%' is expected to come back as BROKEN.
+
+ // That includes a bare '%' with no ID after it.
+
+ // Don't allow scope-id.
+ {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // Don't allow leading or trailing colons.
+ {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+
+ // We allow a single trailing dot.
+ // ... omitted at this time ...
+ // Two dots in a row means not an IP address.
+ {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ // Any non-first components get truncated to one byte.
+ // ... omitted at this time ...
+ // Spaces should be rejected.
+ {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
+ };
+ // Run every case through the 8-bit and then the 16-bit CanonicalizeIPAddress.
+ for (size_t i = 0; i < gurl_base::size(cases); i++) {
+ // 8-bit version.
+ Component component(0, static_cast<int>(strlen(cases[i].input8)));
+
+ std::string out_str1;
+ StdStringCanonOutput output1(&out_str1);
+ CanonHostInfo host_info;
+ CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info);
+ output1.Complete();
+
+ EXPECT_EQ(cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength())) << "iter " << i << " host " << cases[i].input8;
+ if (host_info.family == CanonHostInfo::IPV6) {  // Text/component are only checked for IPV6 results.
+ EXPECT_STREQ(cases[i].expected, out_str1.c_str());
+ EXPECT_EQ(cases[i].expected_component.begin,
+ host_info.out_host.begin);
+ EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+ }
+
+ // 16-bit version. |component| and |host_info| from above are reused.
+ gurl_base::string16 input16(
+ test_utils::TruncateWStringToUTF16(cases[i].input16));
+ component = Component(0, static_cast<int>(input16.length()));
+
+ std::string out_str2;
+ StdStringCanonOutput output2(&out_str2);
+ CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
+ output2.Complete();
+
+ EXPECT_EQ(cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_info.family == CanonHostInfo::IPV6) {  // Text/component are only checked for IPV6 results.
+ EXPECT_STREQ(cases[i].expected, out_str2.c_str());
+ EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
+ EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
+ }
+ }
+}
+
+TEST(URLCanonTest, IPEmpty) {
+ // Neither a default-constructed component nor a zero-length one may be
+ // classified as an IP address, even though the buffer holds IPv4 text.
+ const char kAddressText[] = "192.168.0.1";
+ std::string canon_text;
+ StdStringCanonOutput canon_output(&canon_text);
+ CanonHostInfo info;
+ const Component kEmptyRanges[] = {Component(), Component(0, 0)};
+ for (const Component& range : kEmptyRanges) {
+ CanonicalizeIPAddress(kAddressText, range, &canon_output, &info);
+ EXPECT_FALSE(info.IsIPAddress());
+ }
+}
+
+// Checks CanonicalizeHostSubstring in isolation: the canonical text must match
+// and input that merely looks like an IP address must be left alone. Behavior
+// it shares with CanonicalizeHost is deliberately not re-tested here.
+TEST(URLCanonTest, CanonicalizeHostSubstring) {
+ // Canonicalizes the first |len| characters of |host| into |*canon|,
+ // completes the output, and returns the success flag. The generic parameter
+ // lets the same helper serve both the 8-bit and the 16-bit overload.
+ auto canonicalize = [](const auto* host, int len, std::string* canon) {
+ StdStringCanonOutput sink(canon);
+ const bool ok = CanonicalizeHostSubstring(host, Component(0, len), &sink);
+ sink.Complete();
+ return ok;
+ };
+
+ // Basic sanity check: non-ASCII UTF-8 input comes out as punycode.
+ {
+ std::string canon;
+ EXPECT_TRUE(canonicalize("M\xc3\x9cNCHEN.com", 12, &canon));
+ EXPECT_EQ("xn--mnchen-3ya.com", canon);
+ }
+
+ // Failure case: UTF-16 input containing U+FDD0.
+ {
+ std::string canon;
+ EXPECT_FALSE(canonicalize(
+ test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(), 8,
+ &canon));
+ EXPECT_EQ("%EF%BF%BDzyx.com", canon);
+ }
+
+ // An empty input string is accepted and yields empty output.
+ {
+ std::string canon;
+ EXPECT_TRUE(canonicalize("", 0, &canon));
+ EXPECT_EQ(std::string(), canon);
+ }
+
+ // Numbers that look like an IP address must pass through unchanged.
+ {
+ std::string canon;
+ EXPECT_TRUE(canonicalize("01.02.03.04", 11, &canon));
+ EXPECT_EQ("01.02.03.04", canon);
+ }
+}
+
+TEST(URLCanonTest, UserInfo) {
+ // Expectation: the canonicalizer escapes what needs escaping and treats
+ // empty username/password components as if they were absent.
+
+ // Every input is a complete URL; parsing it yields the initial components.
+ struct UserComponentCase {
+ const char* input;
+ const char* expected;
+ Component expected_username;
+ Component expected_password;
+ bool expected_success;
+ } user_info_cases[] = {
+ {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true},
+ {"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
+ {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
+ {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true},
+ {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true},
+ {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true},
+ {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
+ {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true},
+
+ // IE7 compatibility: old versions allowed backslashes in usernames, but
+ // IE7 does not. We disallow it as well, so the expected output for this
+ // input carries no user info at all.
+ {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true},
+ };
+
+ // Run each case through the 8-bit overload and then the UTF-16 one.
+ for (const UserComponentCase& tc : user_info_cases) {
+ const int spec_len = static_cast<int>(strlen(tc.input));
+ Parsed parts;
+ ParseStandardURL(tc.input, spec_len, &parts);
+ Component user_out, pass_out;
+ std::string canon;
+
+ // First pass: 8-bit input.
+ StdStringCanonOutput narrow_sink(&canon);
+ bool ok = CanonicalizeUserInfo(
+ tc.input, parts.username,
+ tc.input, parts.password,
+ &narrow_sink, &user_out, &pass_out);
+ narrow_sink.Complete();
+
+ // The success flag, the text and both output components must match.
+ EXPECT_EQ(tc.expected_success, ok);
+ EXPECT_EQ(std::string(tc.expected), canon);
+ EXPECT_EQ(tc.expected_username.begin, user_out.begin);
+ EXPECT_EQ(tc.expected_username.len, user_out.len);
+ EXPECT_EQ(tc.expected_password.begin, pass_out.begin);
+ EXPECT_EQ(tc.expected_password.len, pass_out.len);
+
+ // Second pass: the same URL widened to UTF-16, written into the same
+ // (cleared) string through a fresh output wrapper.
+ canon.clear();
+ StdStringCanonOutput wide_sink(&canon);
+ const gurl_base::string16 wide_spec(gurl_base::UTF8ToUTF16(tc.input));
+ ok = CanonicalizeUserInfo(
+ wide_spec.c_str(), parts.username,
+ wide_spec.c_str(), parts.password,
+ &wide_sink, &user_out, &pass_out);
+ wide_sink.Complete();
+
+ // Same expectations as for the 8-bit pass.
+ EXPECT_EQ(tc.expected_success, ok);
+ EXPECT_EQ(std::string(tc.expected), canon);
+ EXPECT_EQ(tc.expected_username.begin, user_out.begin);
+ EXPECT_EQ(tc.expected_username.len, user_out.len);
+ EXPECT_EQ(tc.expected_password.begin, pass_out.begin);
+ EXPECT_EQ(tc.expected_password.len, pass_out.len);
+ }
+}
+
+TEST(URLCanonTest, Port) {
+ // Scanning of the digits is covered by the parser unit tests; this test
+ // only checks what ends up in the output buffer.
+ //
+ // As the expected strings show, CanonicalizePort writes a leading ':' in
+ // front of any port it emits, so the output component starts at offset 1.
+ struct PortCase {
+ const char* input;
+ int default_port;
+ const char* expected;
+ Component expected_component;
+ bool expected_success;
+ } port_cases[] = {
+ // Invalid input is still copied to the output (escaped where needed),
+ // but the call reports failure.
+ {"as df", 80, ":as%20df", Component(1, 7), false},
+ {"-2", 80, ":-2", Component(1, 2), false},
+ // Default port should be omitted.
+ {"80", 80, "", Component(0, -1), true},
+ {"8080", 80, ":8080", Component(1, 4), true},
+ // PORT_UNSPECIFIED should mean always keep the port.
+ {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true},
+ };
+
+ // Each case runs through the 8-bit overload first, then the UTF-16 one.
+ for (const PortCase& tc : port_cases) {
+ const int input_len = static_cast<int>(strlen(tc.input));
+ const Component port_in(0, input_len);
+ Component port_out;
+ std::string canon;
+
+ // 8-bit input.
+ StdStringCanonOutput narrow_sink(&canon);
+ bool ok = CanonicalizePort(tc.input, port_in, tc.default_port,
+ &narrow_sink, &port_out);
+ narrow_sink.Complete();
+
+ EXPECT_EQ(tc.expected_success, ok);
+ EXPECT_EQ(std::string(tc.expected), canon);
+ EXPECT_EQ(tc.expected_component.begin, port_out.begin);
+ EXPECT_EQ(tc.expected_component.len, port_out.len);
+
+ // UTF-16 input, reusing the same (cleared) output string. |port_in| is
+ // reused as well; the table inputs are ASCII, so the length is unchanged.
+ canon.clear();
+ StdStringCanonOutput wide_sink(&canon);
+ const gurl_base::string16 wide_port(gurl_base::UTF8ToUTF16(tc.input));
+ ok = CanonicalizePort(wide_port.c_str(), port_in, tc.default_port,
+ &wide_sink, &port_out);
+ wide_sink.Complete();
+
+ // Expectations are identical for both widths.
+ EXPECT_EQ(tc.expected_success, ok);
+ EXPECT_EQ(std::string(tc.expected), canon);
+ EXPECT_EQ(tc.expected_component.begin, port_out.begin);
+ EXPECT_EQ(tc.expected_component.len, port_out.len);
+ }
+}
+
+TEST(URLCanonTest, Path) {  // Table-driven checks of CanonicalizePath.
+ DualComponentCase path_cases[] = {
+ // ----- path collapsing tests -----
+ {"/././foo", L"/././foo", "/foo", Component(0, 4), true},
+ {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true},
+ {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true},
+ {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true},
+ // double dots followed by a slash or the end of the string count
+ {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true},
+ {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true},
+ // don't count double dots when they aren't followed by a slash
+ {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true},
+ // some in the middle
+ {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true},
+ {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", Component(0, 2), true},
+ // we should not be able to go above the root
+ {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true},
+ {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true},
+ // escaped dots should be unescaped and treated the same as dots
+ {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true},
+ {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true},
+ {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", Component(0, 6), true},
+ // Multiple slashes in a row should be preserved and treated like empty
+ // directory names.
+ {"////../..", L"////../..", "//", Component(0, 2), true},
+
+ // ----- escaping tests -----
+ {"/foo", L"/foo", "/foo", Component(0, 4), true},
+ // Valid escape sequence
+ {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true},
+ // Invalid escape sequence we should pass through unchanged.
+ {"/foo%", L"/foo%", "/foo%", Component(0, 5), true},
+ {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true},
+ // Invalid escape sequence: bad characters should be treated the same as
+ // the surrounding text, not as escaped (in this case, UTF-8).
+ {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true},
+ {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", Component(0, 16), true},
+ {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22), true},
+ // Regular characters that are escaped should be unescaped
+ {"/foo%41%7a", L"/foo%41%7a", "/fooAz", Component(0, 6), true},
+ // Funny characters that are unescaped should be escaped
+ {"/foo\x09\x91%91", NULL, "/foo%09%91%91", Component(0, 13), true},
+ {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true},
+ // Invalid characters that are escaped should cause a failure.
+ {"/foo%00%51", L"/foo%00%51", "/foo%00Q", Component(0, 8), false},
+ // Some characters should be passed through unchanged regardless of esc.
+ {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13), true},
+ // Characters that are properly escaped should not have the case changed
+ // of hex letters.
+ {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13), true},
+ // Funny characters that are unescaped should be escaped
+ {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true},
+ // Backslashes should get converted to forward slashes
+ {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", Component(0, 8), true},
+ // Hashes found in paths (possibly only when the caller explicitly sets
+ // the path on an already-parsed URL) should be escaped.
+ {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true},
+ // %7f should be allowed and %3D should not be unescaped (these were wrong
+ // in a previous version).
+ {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
+ // @ should be passed through unchanged (escaped or unescaped).
+ {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
+ // Nested escape sequences should result in escaping the leading '%' if
+ // unescaping would result in a new escape sequence.
+ {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true},
+ {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true},
+ {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true},
+ // Make sure truncated "nested" escapes don't result in reading off the
+ // string end.
+ {"/%%41", L"/%%41", "/%A", Component(0, 3), true},
+ // Don't unescape the leading '%' if unescaping doesn't result in a valid
+ // new escape sequence.
+ {"/%%470", L"/%%470", "/%G0", Component(0, 4), true},
+ {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true},
+ // Don't erroneously downcast a UTF-16 character in a way that makes it
+ // look like part of an escape sequence.
+ {NULL, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true},
+
+ // ----- encoding tests -----
+ // Basic conversions
+ {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", Component(0, 37), true},
+ // Invalid unicode characters should fail. We only do validation on
+ // UTF-16 input, so this doesn't happen on 8-bit.
+ {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", Component(0, 13), true},
+ {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", Component(0, 13), false},
+ };
+ // A NULL input8/input16 means the case is skipped for that character width.
+ for (size_t i = 0; i < gurl_base::size(path_cases); i++) {
+ if (path_cases[i].input8) {
+ int len = static_cast<int>(strlen(path_cases[i].input8));
+ Component in_comp(0, len);
+ Component out_comp;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success =
+ CanonicalizePath(path_cases[i].input8, in_comp, &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(path_cases[i].expected_success, success);
+ EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
+ EXPECT_EQ(path_cases[i].expected, out_str);
+ }
+
+ if (path_cases[i].input16) {
+ gurl_base::string16 input16(
+ test_utils::TruncateWStringToUTF16(path_cases[i].input16));
+ int len = static_cast<int>(input16.length());
+ Component in_comp(0, len);
+ Component out_comp;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+
+ bool success =
+ CanonicalizePath(input16.c_str(), in_comp, &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(path_cases[i].expected_success, success);
+ EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
+ EXPECT_EQ(path_cases[i].expected, out_str);
+ }
+ }
+
+ // Manual test: embedded NULLs should be escaped and the URL should be marked
+ // as invalid.
+ const char path_with_null[] = "/ab\0c";
+ Component in_comp(0, 5);  // 5 = all characters, including the embedded NUL.
+ Component out_comp;
+
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizePath(path_with_null, in_comp, &output, &out_comp);
+ output.Complete();
+ EXPECT_FALSE(success);
+ EXPECT_EQ("/ab%00c", out_str);
+}
+
+TEST(URLCanonTest, Query) {
+ struct QueryCase {
+ const char* input8;
+ const wchar_t* input16;
+ const char* expected;
+ } query_cases[] = {
+ // Regular ASCII case.
+ {"foo=bar", L"foo=bar", "?foo=bar"},
+ // Allow question marks in the query without escaping
+ {"as?df", L"as?df", "?as?df"},
+ // Always escape '#' since it would mark the ref.
+ {"as#df", L"as#df", "?as%23df"},
+ // Escape some questionable 8-bit characters, but never unescape.
+ {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"},
+ {"%40%41123", L"%40%41123", "?%40%41123"},
+ // Chinese input/output
+ {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"},
+ // Invalid UTF-8/16 input should be replaced with invalid characters.
+ {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"},
+ // Don't allow < or > because sometimes they are used for XSS if the
+ // URL is echoed in content. Firefox does this, IE doesn't.
+ {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"},
+ // Escape double quotemarks in the query.
+ {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"},
+ };
+
+ // Both widths are expected to produce the same canonical text.
+ for (const QueryCase& tc : query_cases) {
+ Component query_out;
+
+ // 8-bit input (skipped when the table has no narrow string).
+ if (tc.input8) {
+ const Component query_in(0, static_cast<int>(strlen(tc.input8)));
+ std::string canon;
+ StdStringCanonOutput sink(&canon);
+ CanonicalizeQuery(tc.input8, query_in, NULL, &sink, &query_out);
+ sink.Complete();
+
+ EXPECT_EQ(tc.expected, canon);
+ }
+
+ // UTF-16 input (skipped when the table has no wide string).
+ if (tc.input16) {
+ const gurl_base::string16 wide_query(
+ test_utils::TruncateWStringToUTF16(tc.input16));
+ const Component query_in(0, static_cast<int>(wide_query.length()));
+ std::string canon;
+ StdStringCanonOutput sink(&canon);
+ CanonicalizeQuery(wide_query.c_str(), query_in, NULL, &sink,
+ &query_out);
+ sink.Complete();
+
+ EXPECT_EQ(tc.expected, canon);
+ }
+ }
+
+ // One more case that cannot live in the table: the input contains a NUL,
+ // so its length (5) has to be passed explicitly.
+ std::string canon_with_nul;
+ StdStringCanonOutput nul_sink(&canon_with_nul);
+ Component nul_out;
+ CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &nul_sink, &nul_out);
+ nul_sink.Complete();
+ EXPECT_EQ("?a%20%00z%01", canon_with_nul);
+}
+
+TEST(URLCanonTest, Ref) {
+ // Ref canonicalization is simple; these cases only exercise the encoding.
+ DualComponentCase ref_cases[] = {
+ {"hello!", L"hello!", "#hello!", Component(1, 6), true},
+ // Spaces, double quotes, angle brackets and backticks get escaped.
+ {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
+ true},
+ {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
+ true},
+ {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
+ true},
+ {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
+ true},
+ {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
+ true},
+ // UTF-8 and wide input produce the same percent-encoded UTF-8.
+ {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
+ // A character that needs more than 16 bits (U+10300, old italic letter A).
+ {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss",
+ Component(1, 14), true},
+ // Existing escapes are kept exactly as written, even invalid ones.
+ {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true},
+ // Invalid UTF-8/16 input comes out as the escaped replacement character.
+ {"\xc2", NULL, "#%EF%BF%BD", Component(1, 9), true},
+ {NULL, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true},
+ // So does a Unicode invalid character.
+ {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%BF%BD", Component(1, 10), true},
+ // '#' signs inside a ref are preserved.
+ {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true},
+ {"#asdf", L"#asdf", "##asdf", Component(1, 5), true},
+ };
+
+ // Every case is tried with each input width the table provides.
+ for (const DualComponentCase& tc : ref_cases) {
+ // Narrow input; a NULL entry means "no 8-bit variant of this case".
+ if (tc.input8) {
+ const Component ref_in(0, static_cast<int>(strlen(tc.input8)));
+ Component ref_out;
+ std::string canon;
+ StdStringCanonOutput sink(&canon);
+
+ CanonicalizeRef(tc.input8, ref_in, &sink, &ref_out);
+ sink.Complete();
+
+ EXPECT_EQ(tc.expected_component.begin, ref_out.begin);
+ EXPECT_EQ(tc.expected_component.len, ref_out.len);
+ EXPECT_EQ(tc.expected, canon);
+ }
+
+ // Wide input; a NULL entry means "no 16-bit variant of this case".
+ if (tc.input16) {
+ const gurl_base::string16 wide_ref(
+ test_utils::TruncateWStringToUTF16(tc.input16));
+ const Component ref_in(0, static_cast<int>(wide_ref.length()));
+ Component ref_out;
+ std::string canon;
+ StdStringCanonOutput sink(&canon);
+
+ CanonicalizeRef(wide_ref.c_str(), ref_in, &sink, &ref_out);
+ sink.Complete();
+
+ EXPECT_EQ(tc.expected_component.begin, ref_out.begin);
+ EXPECT_EQ(tc.expected_component.len, ref_out.len);
+ EXPECT_EQ(tc.expected, canon);
+ }
+ }
+
+ // Finally, an input with an embedded NUL (explicit length 4). The expected
+ // output below shows the NUL being dropped rather than escaped.
+ const char nul_ref[5] = "ab\x00z";
+ const Component nul_ref_in(0, 4);
+ Component nul_ref_out;
+ std::string nul_canon;
+ StdStringCanonOutput nul_sink(&nul_canon);
+
+ CanonicalizeRef(nul_ref, nul_ref_in, &nul_sink, &nul_ref_out);
+ nul_sink.Complete();
+
+ EXPECT_EQ(1, nul_ref_out.begin);
+ EXPECT_EQ(3, nul_ref_out.len);
+ EXPECT_EQ("#abz", nul_canon);
+}
+
+TEST(URLCanonTest, CanonicalizeStandardURL) {
+ // Per-component behavior belongs to the individual component tests. What
+ // is checked here is only that the pieces are included or left out as
+ // expected and that the right separators end up between them.
+ struct URLCase {
+ const char* input;
+ const char* expected;
+ bool expected_success;
+ } cases[] = {
+ {"http://www.google.com/foo?bar=baz#",
+ "http://www.google.com/foo?bar=baz#", true},
+ {"http://[www.google.com]/", "http://[www.google.com]/", false},
+ {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#",
+ false},
+ {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo",
+ true},
+ {"www.google.com", ":www.google.com/", false},
+ {"http://192.0x00A80001", "http://192.168.0.1/", true},
+ {"http://www/foo%2Ehtml", "http://www/foo.html", true},
+ {"http://user:pass@/", "http://user:pass@/", false},
+ {"http://%25DOMAIN:foobar@foodomain.com/",
+ "http://%25DOMAIN:foobar@foodomain.com/", true},
+
+ // Backslashes should get converted to forward slashes.
+ {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
+
+ // Busted refs shouldn't make the whole thing fail.
+ {"http://www.google.com/asdf#\xc2",
+ "http://www.google.com/asdf#%EF%BF%BD", true},
+
+ // Basic port tests.
+ {"http://foo:80/", "http://foo/", true},
+ {"http://foo:81/", "http://foo:81/", true},
+ {"httpa://foo:80/", "httpa://foo:80/", true},
+ {"http://foo:-80/", "http://foo:-80/", false},
+
+ {"https://foo:443/", "https://foo/", true},
+ {"https://foo:80/", "https://foo:80/", true},
+ {"ftp://foo:21/", "ftp://foo/", true},
+ {"ftp://foo:80/", "ftp://foo:80/", true},
+ {"gopher://foo:70/", "gopher://foo/", true},
+ {"gopher://foo:443/", "gopher://foo:443/", true},
+ {"ws://foo:80/", "ws://foo/", true},
+ {"ws://foo:81/", "ws://foo:81/", true},
+ {"ws://foo:443/", "ws://foo:443/", true},
+ {"ws://foo:815/", "ws://foo:815/", true},
+ {"wss://foo:80/", "wss://foo:80/", true},
+ {"wss://foo:81/", "wss://foo:81/", true},
+ {"wss://foo:443/", "wss://foo/", true},
+ {"wss://foo:815/", "wss://foo:815/", true},
+
+ // This particular code path ends up "backing up" to replace an invalid
+ // host ICU generated with an escaped version. Test that in the context
+ // of a full URL to make sure the backing up doesn't mess up the non-host
+ // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
+ // ICU will convert to an ASCII one, generating "%81".
+ {"ws:)W\x1eW\xef\xb9\xaa"
+ "81:80/",
+ "ws://%29w%1ew%81/", false},
+ };
+
+ for (const URLCase& tc : cases) {
+ const int spec_len = static_cast<int>(strlen(tc.input));
+ Parsed parts_in;
+ ParseStandardURL(tc.input, spec_len, &parts_in);
+
+ // Canonicalize as a scheme that has host, port and user information.
+ Parsed parts_out;
+ std::string canon;
+ StdStringCanonOutput sink(&canon);
+ const bool ok = CanonicalizeStandardURL(
+ tc.input, spec_len, parts_in,
+ SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL, &sink, &parts_out);
+ sink.Complete();
+ EXPECT_EQ(tc.expected_success, ok);
+ EXPECT_EQ(tc.expected, canon);
+ }
+}
+
+// The codepath here is the same as for regular canonicalization, so we just
+// need to test that things are replaced or not correctly.
+TEST(URLCanonTest, ReplaceStandardURL) {
+  ReplaceCase replace_cases[] = {
+      // Typical truncation: path reset to "/", query and ref removed.
+      {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"},
+      // Every component is overridden at once.
+      {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"},
+      // No component is overridden.
+      {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
+      // Swapping in the "filesystem" scheme yields a nonsensical URL, but
+      // the replacement is applied as requested.
+      {"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(replace_cases); ++i) {
+    const ReplaceCase& cur = replace_cases[i];
+    const int spec_len = static_cast<int>(strlen(cur.base));
+    Parsed base_parsed;
+    ParseStandardURL(cur.base, spec_len, &base_parsed);
+
+    using R = Replacements<char>;  // Shorthand for the member pointers below.
+    R repl;
+
+    // The scheme has no clear function of its own, so ClearRef is handed in
+    // as the clear function for the scheme slot.
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &repl, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &repl, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &repl, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &repl, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &repl, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &repl, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &repl, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &repl, cur.ref);
+
+    std::string out_str;
+    StdStringCanonOutput canon_out(&out_str);
+    Parsed out_parsed;
+    ReplaceStandardURL(cur.base, base_parsed, repl,
+                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
+                       &canon_out, &out_parsed);
+    canon_out.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+
+  // A replacement path without any characters must never be dereferenced.
+  {
+    const char src[] = "http://www.google.com/here_is_the_path";
+    const int src_len = static_cast<int>(strlen(src));
+
+    Parsed src_parsed;
+    ParseStandardURL(src, src_len, &src_parsed);
+
+    // First a zero-length path located at address 1: reading through that
+    // pointer would fault, so surviving the call shows it was not touched.
+    Replacements<char> repl;
+    repl.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0));
+    std::string out_str1;
+    StdStringCanonOutput canon_out1(&out_str1);
+    Parsed ignored_parsed;
+    ReplaceStandardURL(src, src_parsed, repl,
+                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
+                       &canon_out1, &ignored_parsed);
+    canon_out1.Complete();
+    EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
+
+    // Then the same bogus pointer paired with a default-constructed Component.
+    repl.SetPath(reinterpret_cast<char*>(0x00000001), Component());
+    std::string out_str2;
+    StdStringCanonOutput canon_out2(&out_str2);
+    ReplaceStandardURL(src, src_parsed, repl,
+                       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
+                       &canon_out2, &ignored_parsed);
+    canon_out2.Complete();
+    EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
+  }
+}
+
+TEST(URLCanonTest, ReplaceFileURL) {
+  ReplaceCase replace_cases[] = {
+      // Override host, path, query and ref together.
+      {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
+      // No overrides: the URL comes back unchanged.
+      {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"},
+      // Delete the query and ref, keeping only the path (the common case).
+      {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
+      // A replacement path lacking a leading slash must have one inserted in
+      // the result.
+      {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
+      {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
+      {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"},
+      {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"},
+      {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
+      // A scheme replacement leaves a file URL untouched.
+      {"file:///C:/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(replace_cases); ++i) {
+    const ReplaceCase& cur = replace_cases[i];
+    const int spec_len = static_cast<int>(strlen(cur.base));
+    Parsed base_parsed;
+    ParseFileURL(cur.base, spec_len, &base_parsed);
+
+    using R = Replacements<char>;  // Shorthand for the member pointers below.
+    R repl;
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &repl, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &repl, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &repl, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &repl, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &repl, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &repl, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &repl, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &repl, cur.ref);
+
+    std::string out_str;
+    StdStringCanonOutput canon_out(&out_str);
+    Parsed out_parsed;
+    ReplaceFileURL(cur.base, base_parsed, repl, NULL, &canon_out, &out_parsed);
+    canon_out.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+}
+
+TEST(URLCanonTest, ReplaceFileSystemURL) {
+  ReplaceCase replace_cases[] = {
+      // Override path, query and ref of the outer URL.
+      {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL,
+       NULL, "/foo", "b", "c", "filesystem:file:///temporary/foo?b#c"},
+      // No overrides: the URL comes back unchanged.
+      {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, "filesystem:file:///temporary/gaba?query#ref"},
+      // Delete the query and ref, keeping only the path (the common case).
+      {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL,
+       NULL, NULL, kDeleteComp, kDeleteComp,
+       "filesystem:file:///temporary/gaba"},
+      // A replacement path lacking a leading slash must have one inserted in
+      // the result.
+      {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL,
+       NULL, "interesting/", NULL, NULL,
+       "filesystem:file:///temporary/interesting/?query#ref"},
+      // Scheme override: ignored, the URL is only canonicalized.
+      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL,
+       "filesystem:http://bar.com/t/gaba?query#ref"},
+      // Username override: ignored, the URL is only canonicalized.
+      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, "u2", NULL, NULL,
+       NULL, NULL, NULL, NULL, "filesystem:http://bar.com/t/gaba?query#ref"},
+      // Password override: ignored, the URL is only canonicalized.
+      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, "pw2",
+       NULL, NULL, NULL, NULL, NULL,
+       "filesystem:http://bar.com/t/gaba?query#ref"},
+      // Host override: ignored, the URL is only canonicalized.
+      {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", NULL, NULL, NULL,
+       "foo.com", NULL, NULL, NULL, NULL,
+       "filesystem:http://bar.com/t/gaba?query#ref"},
+      // Port override: ignored, the URL is only canonicalized.
+      {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", NULL, NULL, NULL,
+       NULL, "41", NULL, NULL, NULL,
+       "filesystem:http://bar.com:40/t/gaba?query#ref"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(replace_cases); ++i) {
+    const ReplaceCase& cur = replace_cases[i];
+    const int spec_len = static_cast<int>(strlen(cur.base));
+    Parsed base_parsed;
+    ParseFileSystemURL(cur.base, spec_len, &base_parsed);
+
+    using R = Replacements<char>;  // Shorthand for the member pointers below.
+    R repl;
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &repl, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &repl, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &repl, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &repl, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &repl, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &repl, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &repl, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &repl, cur.ref);
+
+    std::string out_str;
+    StdStringCanonOutput canon_out(&out_str);
+    Parsed out_parsed;
+    ReplaceFileSystemURL(cur.base, base_parsed, repl, NULL, &canon_out, &out_parsed);
+    canon_out.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+}
+
+TEST(URLCanonTest, ReplacePathURL) {
+  ReplaceCase replace_cases[] = {
+      // Override both the scheme and the path.
+      {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"},
+      // No overrides: the URL comes back unchanged.
+      {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"},
+      // Override only the scheme, only the path, or delete the path.
+      {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"},
+      {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"},
+      {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(replace_cases); ++i) {
+    const ReplaceCase& cur = replace_cases[i];
+    const int spec_len = static_cast<int>(strlen(cur.base));
+    Parsed base_parsed;
+    ParsePathURL(cur.base, spec_len, false, &base_parsed);
+
+    using R = Replacements<char>;  // Shorthand for the member pointers below.
+    R repl;
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &repl, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &repl, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &repl, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &repl, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &repl, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &repl, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &repl, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &repl, cur.ref);
+
+    std::string out_str;
+    StdStringCanonOutput canon_out(&out_str);
+    Parsed out_parsed;
+    ReplacePathURL(cur.base, base_parsed, repl, &canon_out, &out_parsed);
+    canon_out.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+}
+
+TEST(URLCanonTest, ReplaceMailtoURL) {
+  ReplaceCase replace_cases[] = {
+      // Override scheme, path and query together.
+      {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"},
+      // No overrides: the URL comes back unchanged.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"},
+      // Override only the path.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"},
+      // Override only the query.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"},
+      // Override the path and the query.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"},
+      // An empty (but present) query keeps the trailing question mark.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"},
+      // Remove the query entirely.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"},
+      // Remove the path entirely.
+      {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"},
+      // Remove both the path and the query.
+      {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"},
+      // A ref replacement is ignored for mailto URLs.
+      {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"},
+  };
+
+  for (size_t i = 0; i < gurl_base::size(replace_cases); ++i) {
+    const ReplaceCase& cur = replace_cases[i];
+    const int spec_len = static_cast<int>(strlen(cur.base));
+    Parsed base_parsed;
+    ParseMailtoURL(cur.base, spec_len, &base_parsed);
+
+    using R = Replacements<char>;  // Shorthand for the member pointers below.
+    R repl;
+    SetupReplComp(&R::SetScheme, &R::ClearRef, &repl, cur.scheme);
+    SetupReplComp(&R::SetUsername, &R::ClearUsername, &repl, cur.username);
+    SetupReplComp(&R::SetPassword, &R::ClearPassword, &repl, cur.password);
+    SetupReplComp(&R::SetHost, &R::ClearHost, &repl, cur.host);
+    SetupReplComp(&R::SetPort, &R::ClearPort, &repl, cur.port);
+    SetupReplComp(&R::SetPath, &R::ClearPath, &repl, cur.path);
+    SetupReplComp(&R::SetQuery, &R::ClearQuery, &repl, cur.query);
+    SetupReplComp(&R::SetRef, &R::ClearRef, &repl, cur.ref);
+
+    std::string out_str;
+    StdStringCanonOutput canon_out(&out_str);
+    Parsed out_parsed;
+    ReplaceMailtoURL(cur.base, base_parsed, repl, &canon_out, &out_parsed);
+    canon_out.Complete();
+
+    EXPECT_EQ(replace_cases[i].expected, out_str);
+  }
+}
+
+TEST(URLCanonTest, CanonicalizeFileURL) {
+ struct URLCase {
+ const char* input; // Spec passed to ParseFileURL and CanonicalizeFileURL.
+ const char* expected; // Exact canonical text expected in out_str.
+ bool expected_success; // Expected CanonicalizeFileURL return value.
+ Component expected_host; // Expected host (begin, len) in the output.
+ Component expected_path; // Expected path (begin, len) in the output.
+ } cases[] = {
+#ifdef _WIN32
+ // Windows-style paths
+ {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(),
+ Component(7, 16)},
+ {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true,
+ Component(), Component(7, 19)},
+ {"file:", "file:///", true, Component(), Component(7, 1)},
+ {"file:UNChost/path", "file://unchost/path", true, Component(7, 7),
+ Component(14, 5)},
+ // CanonicalizeFileURL supports absolute Windows style paths for IE
+ // compatibility. Note that the caller must decide that this is a file
+ // URL itself so it can call the file canonicalizer. This is usually
+ // done automatically as part of relative URL resolving.
+ {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(),
+ Component(7, 11)},
+ {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
+ {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(),
+ Component(7, 11)},
+ {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(),
+ Component(7, 11)},
+ {"//server/file", "file://server/file", true, Component(7, 6),
+ Component(13, 5)},
+ {"\\\\server\\file", "file://server/file", true, Component(7, 6),
+ Component(13, 5)},
+ {"/\\server/file", "file://server/file", true, Component(7, 6),
+ Component(13, 5)},
+ // We should preserve the number of slashes after the colon for IE
+ // compatibility, except when there is none, in which case we should
+ // add one.
+ {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(),
+ Component(7, 16)},
+ {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true,
+ Component(), Component(7, 19)},
+ // Three slashes should be non-UNC, even if there is no drive spec (IE
+ // does this, which makes the resulting request invalid).
+ {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(),
+ Component(7, 12)},
+ // TODO(brettw) we should probably fail for invalid host names, which
+ // would change the expected result on this test. We also currently allow
+ // colon even though it's probably invalid, because it's currently the
+ // "natural" result of the way the canonicalizer is written. There doesn't
+ // seem to be a strong argument for why allowing it here would be bad, so
+ // we just tolerate it and the load will fail later.
+ {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false,
+ Component(7, 2), Component(9, 16)},
+ {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5),
+ Component(12, 8)},
+ // Make sure relative paths can't go above the "C:"
+ {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true,
+ Component(), Component(7, 12)},
+ // Busted refs shouldn't make the whole thing fail.
+ {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(),
+ Component(7, 8)},
+#else
+ // Unix-style paths
+ {"file:///home/me", "file:///home/me", true, Component(), Component(7, 8)},
+ // Windows-style ones should still get treated as Unix-style.
+ {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(), Component(7, 16)},
+ {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, Component(), Component(7, 19)},
+ // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
+ {"//", "file:///", true, Component(), Component(7, 1)},
+ {"///", "file:///", true, Component(), Component(7, 1)},
+ {"///test", "file:///test", true, Component(), Component(7, 5)},
+ {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)},
+ {"file://localhost", "file://localhost/", true, Component(7, 9), Component(16, 1)},
+ {"file://localhost/", "file://localhost/", true, Component(7, 9), Component(16, 1)},
+ {"file://localhost/test", "file://localhost/test", true, Component(7, 9), Component(16, 5)},
+#endif // _WIN32
+ };
+
+ for (size_t i = 0; i < gurl_base::size(cases); i++) {
+ int url_len = static_cast<int>(strlen(cases[i].input));
+ Parsed parsed;
+ ParseFileURL(cases[i].input, url_len, &parsed);
+
+ Parsed out_parsed;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizeFileURL(cases[i].input, url_len, parsed, NULL,
+ &output, &out_parsed);
+ output.Complete(); // Always called before out_str is inspected.
+
+ EXPECT_EQ(cases[i].expected_success, success);
+ EXPECT_EQ(cases[i].expected, out_str);
+
+ // Make sure the spec was properly identified; the file canonicalizer has
+ // different code for writing the spec.
+ EXPECT_EQ(0, out_parsed.scheme.begin);
+ EXPECT_EQ(4, out_parsed.scheme.len); // Length of "file".
+
+ EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin);
+ EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len);
+
+ EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
+ EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
+ }
+}
+
+TEST(URLCanonTest, CanonicalizeFileSystemURL) {
+ struct URLCase {
+ const char* input; // Spec passed to ParseFileSystemURL and the canonicalizer.
+ const char* expected; // Exact canonical text expected in out_str.
+ bool expected_success; // Expected CanonicalizeFileSystemURL return value.
+ } cases[] = {
+ {"Filesystem:htTp://www.Foo.com:80/tempoRary", "filesystem:http://www.foo.com/tempoRary/", true},
+ {"filesystem:httpS://www.foo.com/temporary/", "filesystem:https://www.foo.com/temporary/", true},
+ {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//", false},
+ {"filesystem:http://www.foo.com/persistent/bob?query#ref", "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
+ {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
+ {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
+ {"filesystem:File:///temporary/Bob?qUery#reF", "filesystem:file:///temporary/Bob?qUery#reF", true},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(cases); i++) {
+ int url_len = static_cast<int>(strlen(cases[i].input));
+ Parsed parsed;
+ ParseFileSystemURL(cases[i].input, url_len, &parsed);
+
+ Parsed out_parsed;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizeFileSystemURL(cases[i].input, url_len, parsed,
+ NULL, &output, &out_parsed);
+ output.Complete(); // Always called before out_str is inspected.
+
+ EXPECT_EQ(cases[i].expected_success, success);
+ EXPECT_EQ(cases[i].expected, out_str);
+
+ // Make sure the spec was properly identified; the filesystem canonicalizer
+ // has different code for writing the spec.
+ EXPECT_EQ(0, out_parsed.scheme.begin);
+ EXPECT_EQ(10, out_parsed.scheme.len); // Length of "filesystem".
+ if (success) // The path length is only checked for successful cases.
+ EXPECT_GT(out_parsed.path.len, 0);
+ }
+}
+
+TEST(URLCanonTest, CanonicalizePathURL) {
+ // Path URLs should get canonicalized schemes but nothing else.
+ struct PathCase {
+ const char* input; // Spec passed to ParsePathURL and CanonicalizePathURL.
+ const char* expected; // Exact canonical text expected in out_str.
+ } path_cases[] = {
+ {"javascript:", "javascript:"},
+ {"JavaScript:Foo", "javascript:Foo"},
+ {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"},
+
+ // Validation errors should not cause failure. See
+ // https://crbug.com/925614.
+ {"javascript:\uFFFF", "javascript:%EF%BF%BD"},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(path_cases); i++) {
+ int url_len = static_cast<int>(strlen(path_cases[i].input));
+ Parsed parsed;
+ ParsePathURL(path_cases[i].input, url_len, true, &parsed);
+
+ Parsed out_parsed;
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizePathURL(path_cases[i].input, url_len, parsed,
+ &output, &out_parsed);
+ output.Complete(); // Always called before out_str is inspected.
+
+ EXPECT_TRUE(success); // Every case in the table is expected to succeed.
+ EXPECT_EQ(path_cases[i].expected, out_str);
+
+ EXPECT_EQ(0, out_parsed.host.begin);
+ EXPECT_EQ(-1, out_parsed.host.len); // No host is expected for path URLs.
+
+ // When the input ends with a colon, there should be no content after it.
+ if (path_cases[i].input[url_len - 1] == ':') {
+ EXPECT_EQ(0, out_parsed.GetContent().begin);
+ EXPECT_EQ(-1, out_parsed.GetContent().len);
+ }
+ }
+}
+
+TEST(URLCanonTest, CanonicalizeMailtoURL) {
+ struct URLCase {
+ const char* input; // Spec passed to ParseMailtoURL and the canonicalizer.
+ const char* expected; // Exact canonical text expected in out_str.
+ bool expected_success; // Expected CanonicalizeMailtoURL return value.
+ Component expected_path; // Expected path (begin, len) in the output.
+ Component expected_query; // Expected query (begin, len) in the output.
+ } cases[] = {
+ // Null character should be escaped to %00.
+ // Keep this test first in the list as it is handled specially below.
+ {"mailto:addr1\0addr2?foo",
+ "mailto:addr1%00addr2?foo",
+ true, Component(7, 13), Component(21, 3)},
+ {"mailto:addr1",
+ "mailto:addr1",
+ true, Component(7, 5), Component()},
+ {"mailto:addr1@foo.com",
+ "mailto:addr1@foo.com",
+ true, Component(7, 13), Component()},
+ // Trailing whitespace is stripped.
+ {"MaIlTo:addr1 \t ",
+ "mailto:addr1",
+ true, Component(7, 5), Component()},
+ {"MaIlTo:addr1?to=jon",
+ "mailto:addr1?to=jon",
+ true, Component(7, 5), Component(13,6)},
+ {"mailto:addr1,addr2",
+ "mailto:addr1,addr2",
+ true, Component(7, 11), Component()},
+ // Embedded spaces must be encoded.
+ {"mailto:addr1, addr2",
+ "mailto:addr1,%20addr2",
+ true, Component(7, 14), Component()},
+ {"mailto:addr1, addr2?subject=one two ",
+ "mailto:addr1,%20addr2?subject=one%20two",
+ true, Component(7, 14), Component(22, 17)},
+ {"mailto:addr1%2caddr2",
+ "mailto:addr1%2caddr2",
+ true, Component(7, 13), Component()},
+ {"mailto:\xF0\x90\x8C\x80",
+ "mailto:%F0%90%8C%80",
+ true, Component(7, 12), Component()},
+ // Invalid -- UTF-8 encoded surrogate value.
+ {"mailto:\xed\xa0\x80",
+ "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD",
+ false, Component(7, 27), Component()},
+ {"mailto:addr1?",
+ "mailto:addr1?",
+ true, Component(7, 5), Component(13, 0)},
+ // Certain characters have special meanings and must be encoded.
+ {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~",
+ "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~",
+ true, Component(7, 53), Component(61, 47)},
+ };
+
+ // Defined outside of the loop to catch bugs where components aren't reset.
+ Parsed parsed;
+ Parsed out_parsed;
+
+ for (size_t i = 0; i < gurl_base::size(cases); i++) {
+ int url_len = static_cast<int>(strlen(cases[i].input));
+ if (i == 0) {
+ // The first test case purposely has a '\0' in it -- don't count it
+ // as the string terminator.
+ url_len = 22; // Full length of that input, embedded '\0' included.
+ }
+ ParseMailtoURL(cases[i].input, url_len, &parsed);
+
+ std::string out_str;
+ StdStringCanonOutput output(&out_str);
+ bool success = CanonicalizeMailtoURL(cases[i].input, url_len, parsed,
+ &output, &out_parsed);
+ output.Complete(); // Always called before out_str is inspected.
+
+ EXPECT_EQ(cases[i].expected_success, success);
+ EXPECT_EQ(cases[i].expected, out_str);
+
+ // Make sure the spec was properly identified.
+ EXPECT_EQ(0, out_parsed.scheme.begin);
+ EXPECT_EQ(6, out_parsed.scheme.len); // Length of "mailto".
+
+ EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
+ EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
+
+ EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
+ EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
+ }
+}
+
+#ifndef WIN32
+
+TEST(URLCanonTest, _itoa_s) {
+ // We fill the buffer with 0xff to ensure that it's getting properly
+ // null-terminated. We also allocate one byte more than what we tell
+ // _itoa_s about, and ensure that the extra byte is untouched.
+ char buf[6]; // The explicit-size calls below are told sizeof(buf) - 1 == 5.
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, _itoa_s(12, buf, sizeof(buf) - 1, 10));
+ EXPECT_STREQ("12", buf);
+ EXPECT_EQ('\xFF', buf[3]); // First byte after "12" and its terminator.
+
+ // Test the edge cases - exactly the buffer size and one over.
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 10));
+ EXPECT_STREQ("1234", buf); // Four digits plus the terminator fill 5 bytes.
+ EXPECT_EQ('\xFF', buf[5]);
+
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(EINVAL, _itoa_s(12345, buf, sizeof(buf) - 1, 10));
+ EXPECT_EQ('\xFF', buf[5]); // should never write to this location
+
+ // Test the template overload (note that this will see the full buffer)
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, _itoa_s(12, buf, 10));
+ EXPECT_STREQ("12", buf);
+ EXPECT_EQ('\xFF', buf[3]);
+
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, _itoa_s(12345, buf, 10)); // Five digits fit all 6 bytes.
+ EXPECT_STREQ("12345", buf);
+
+ EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10)); // Six digits do not fit.
+
+ // Test that radix 16 is supported.
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, _itoa_s(1234, buf, sizeof(buf) - 1, 16));
+ EXPECT_STREQ("4d2", buf);
+ EXPECT_EQ('\xFF', buf[5]);
+}
+
+TEST(URLCanonTest, _itow_s) {
+  // We fill the buffer with 0xff bytes to ensure that it's getting properly
+  // null-terminated. We also allocate one character more than what we tell
+  // _itow_s about, and ensure that the extra character is untouched.
+  gurl_base::char16 buf[6];
+  const char fill_mem = 0xff;
+  const gurl_base::char16 fill_char = 0xffff;
+  memset(buf, fill_mem, sizeof(buf));
+  EXPECT_EQ(0, _itow_s(12, buf, sizeof(buf) / 2 - 1, 10));
+  EXPECT_EQ(gurl_base::UTF8ToUTF16("12"), gurl_base::string16(buf));
+  EXPECT_EQ(fill_char, buf[3]);
+
+  // Test the edge cases - exactly the buffer size and one over.
+  memset(buf, fill_mem, sizeof(buf));  // Refill so no stale data is checked.
+  EXPECT_EQ(0, _itow_s(1234, buf, sizeof(buf) / 2 - 1, 10));
+  EXPECT_EQ(gurl_base::UTF8ToUTF16("1234"), gurl_base::string16(buf));
+  EXPECT_EQ(fill_char, buf[5]);
+
+  memset(buf, fill_mem, sizeof(buf));
+  EXPECT_EQ(EINVAL, _itow_s(12345, buf, sizeof(buf) / 2 - 1, 10));
+  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location
+
+  // Test the template overload (note that this will see the full buffer)
+  memset(buf, fill_mem, sizeof(buf));
+  EXPECT_EQ(0, _itow_s(12, buf, 10));
+  EXPECT_EQ(gurl_base::UTF8ToUTF16("12"), gurl_base::string16(buf));
+  EXPECT_EQ(fill_char, buf[3]);
+
+  memset(buf, fill_mem, sizeof(buf));
+  EXPECT_EQ(0, _itow_s(12345, buf, 10));  // Five digits fill all 6 slots.
+  EXPECT_EQ(gurl_base::UTF8ToUTF16("12345"), gurl_base::string16(buf));
+
+  EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10));  // Six digits do not fit.
+}
+
+#endif // !WIN32
+
+// Returns true when the scheme, username, password, host, port, path, query
+// and ref components of |a| and |b| all have identical begin and len values.
+static bool ParsedIsEqual(const Parsed& a, const Parsed& b) {
+  const auto same = [](const auto& x, const auto& y) {
+    return x.begin == y.begin && x.len == y.len;
+  };
+  return same(a.scheme, b.scheme) && same(a.username, b.username) &&
+         same(a.password, b.password) && same(a.host, b.host) &&
+         same(a.port, b.port) && same(a.path, b.path) &&
+         same(a.query, b.query) && same(a.ref, b.ref);
+}
+
+TEST(URLCanonTest, ResolveRelativeURL) {
+ struct RelativeCase {
+ const char* base; // Input base URL: MUST BE CANONICAL
+ bool is_base_hier; // Is the base URL hierarchical
+ bool is_base_file; // Tells us if the base is a file URL.
+ const char* test; // Input URL to test against.
+ bool succeed_relative; // Whether we expect IsRelativeURL to succeed
+ bool is_rel; // Whether we expect |test| to be relative or not.
+ bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed.
+ const char* resolved; // What we expect in the result when resolving.
+ } rel_cases[] = {
+ // Basic absolute input.
+ {"http://host/a", true, false, "http://another/", true, false, false, NULL},
+ {"http://host/a", true, false, "http:////another/", true, false, false, NULL},
+ // Empty relative URLs should only remove the ref part of the URL,
+ // leaving the rest unchanged.
+ {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
+ // Spaces at the ends of the relative path should be ignored.
+ {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"},
+ {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
+ {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"},
+ // Matching schemes without two slashes are treated as relative.
+ {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"},
+ {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"},
+ {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"},
+ {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"},
+ // Nonmatching schemes are absolute.
+ {"http://host/a", true, false, "https:host2", true, false, false, NULL},
+ {"http://host/a", true, false, "htto:/host2", true, false, false, NULL},
+ // Absolute path input
+ {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"},
+ {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"},
+ {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"},
+ {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"},
+ {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"},
+ {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"},
+ // Relative path input
+ {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
+ {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"},
+ {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"},
+ {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
+ {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"},
+ {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"},
+ // Query input
+ {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"},
+ {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"},
+ {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"},
+ // Ref input
+ {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"},
+ {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
+ {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"},
+ // Non-hierarchical base: no relative handling. Relative input should
+ // error, and if a scheme is present, it should be treated as absolute.
+ {"data:foobar", false, false, "baz.html", false, false, false, NULL},
+ {"data:foobar", false, false, "data:baz", true, false, false, NULL},
+ {"data:foobar", false, false, "data:/base", true, false, false, NULL},
+ // Non-hierarchical base: absolute input should succeed.
+ {"data:foobar", false, false, "http://host/", true, false, false, NULL},
+ {"data:foobar", false, false, "http:host", true, false, false, NULL},
+ // Non-hierarchical base: empty URL should give error.
+ {"data:foobar", false, false, "", false, false, false, NULL},
+ // Invalid schemes should be treated as relative.
+ {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
+ {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
+ {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"},
+ {"data:asdf", false, false, ":foo", false, false, false, NULL},
+ {"data:asdf", false, false, "bad(':foo')", false, false, false, NULL},
+ // We should treat semicolons like any other character in URL resolving
+ {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"},
+ {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"},
+ {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"},
+ // Relative URLs can also be written as "//foo/bar" which is relative to
+ // the scheme. In this case, it would take the old scheme, so for http
+ // the example would resolve to "http://foo/bar".
+ {"http://host/a", true, false, "//another", true, true, true, "http://another/"},
+ {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"},
+ {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "//", true, true, false, "http:"},
+ // IE will also allow one or the other to be a backslash to get the same
+ // behavior.
+ {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"},
+#ifdef WIN32
+ // Resolving against Windows file base URLs.
+ {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL},
+ {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
+ {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"},
+ {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"},
+ // But two backslashes on Windows should be UNC so should be treated
+ // as absolute.
+ {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL},
+ // IE doesn't support drive specs starting with two slashes. It fails
+ // immediately and doesn't even try to load. We fix it up to either
+ // an absolute path or UNC depending on what it looks like.
+ {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"},
+ {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"},
+ // Windows drive specs should be allowed and treated as absolute.
+ {"file:///C:/foo", true, true, "c:", true, false, false, NULL},
+ {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL},
+ {"http://host/a", true, false, "c:\\foo", true, false, false, NULL},
+ // Relative paths with drive letters should be allowed when the base is
+ // also a file.
+ {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"},
+ // Treat absolute paths as being off of the drive.
+ {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"},
+ {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"},
+ {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"},
+ // On Windows, two slashes without a drive letter when the base is a file
+ // means that the path is UNC.
+ {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"},
+ {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"},
+#else
+ // On Unix we fall back to relative behavior since there's nothing else
+ // reasonable to do.
+ {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"},
+#endif
+ // Even on Windows, we don't allow relative drive specs when the base
+ // is not file.
+ {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"},
+ {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"},
+ // Ensure that ports aren't allowed for hosts relative to a file url.
+ // Although the result string shows a host:port portion, the call to
+ // resolve the relative URL returns false, indicating parse failure,
+ // which is what is required.
+ {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"},
+ // Filesystem URL tests; filesystem URLs are only valid and relative if
+ // they have no scheme, e.g. "./index.html". There's no valid equivalent
+ // to http:index.html.
+ {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, NULL},
+ {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"},
+ {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"},
+ {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, NULL},
+ // Absolute URLs are still not relative to a non-standard base URL.
+ {"about:blank", false, false, "http://X/A", true, false, true, ""},
+ {"about:blank", false, false, "content://content.Provider/", true, false, true, ""},
+ };
+
+ for (size_t i = 0; i < gurl_base::size(rel_cases); i++) {
+ const RelativeCase& cur_case = rel_cases[i];
+
+ Parsed parsed;
+ int base_len = static_cast<int>(strlen(cur_case.base));
+ if (cur_case.is_base_file)
+ ParseFileURL(cur_case.base, base_len, &parsed);
+ else if (cur_case.is_base_hier)
+ ParseStandardURL(cur_case.base, base_len, &parsed);
+ else
+ ParsePathURL(cur_case.base, base_len, false, &parsed);
+
+ // First see if it is relative.
+ int test_len = static_cast<int>(strlen(cur_case.test));
+ bool is_relative;
+ Component relative_component;
+ bool succeed_is_rel = IsRelativeURL(
+ cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
+ &is_relative, &relative_component);
+
+ EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
+ "succeed is rel failure on " << cur_case.test;
+ EXPECT_EQ(cur_case.is_rel, is_relative) <<
+ "is rel failure on " << cur_case.test;
+ // Now resolve it.
+ if (succeed_is_rel && is_relative && cur_case.is_rel) {
+ std::string resolved;
+ StdStringCanonOutput output(&resolved);
+ Parsed resolved_parsed;
+
+ bool succeed_resolve = ResolveRelativeURL(
+ cur_case.base, parsed, cur_case.is_base_file, cur_case.test,
+ relative_component, NULL, &output, &resolved_parsed);
+ output.Complete();
+
+ EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
+ EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
+
+ // Verify that the output parsed structure is the same as parsing
+ // the URL freshly.
+ Parsed ref_parsed;
+ int resolved_len = static_cast<int>(resolved.size());
+ if (cur_case.is_base_file) {
+ ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
+ } else if (cur_case.is_base_hier) {
+ ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
+ } else {
+ ParsePathURL(resolved.c_str(), resolved_len, false, &ref_parsed);
+ }
+ EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
+ }
+ }
+}
+
+ // Regression test for component replacement with a long buffer of UTF-16
+ // characters, which used to leave invalid data in the URL: the buffer
+ // holding the UTF-8 data was resized while some pointers were still kept to
+ // the old buffer that was removed.
+ TEST(URLCanonTest, ReplacementOverflow) {
+   const char src[] = "file:///C:/foo/bar";
+   Parsed parsed;
+   ParseFileURL(src, static_cast<int>(strlen(src)), &parsed);
+
+   // Two components get replaced: the path with something short, and the
+   // query with something long enough to trigger the bug.
+   Replacements<gurl_base::char16> repl;
+   gurl_base::string16 new_query;
+   for (int count = 0; count < 4800; ++count)
+     new_query.push_back('a');
+   const int new_query_len = static_cast<int>(new_query.length());
+
+   gurl_base::string16 new_path(test_utils::TruncateWStringToUTF16(L"/foo"));
+   repl.SetPath(new_path.c_str(), Component(0, 4));
+   repl.SetQuery(new_query.c_str(), Component(0, new_query_len));
+
+   // Run the replacement. Which flavor is called (standard URLs, file URLs,
+   // etc.) doesn't matter, since they all go to the same replacement function
+   // that was buggy.
+   std::string repl_str;
+   StdStringCanonOutput repl_output(&repl_str);
+   Parsed repl_parsed;
+   ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed);
+   repl_output.Complete();
+
+   // Build the expected string (one 'a' per character of the new query) and
+   // compare it against the replacement output.
+   std::string expected("file:///foo?");
+   for (size_t count = 0; count < new_query.length(); ++count)
+     expected.push_back('a');
+   EXPECT_TRUE(expected == repl_str);
+ }
+
+ // Known lowercase schemes map to their default port; unknown schemes and
+ // uppercase spellings are expected to produce PORT_UNSPECIFIED.
+ TEST(URLCanonTest, DefaultPortForScheme) {
+   struct TestCases {
+     const char* scheme;
+     const int expected_port;
+   } cases[]{
+       // Schemes with a default port.
+       {"http", 80}, {"https", 443}, {"ftp", 21},
+       {"ws", 80}, {"wss", 443}, {"gopher", 70},
+       // A scheme that is not recognized.
+       {"fake-scheme", PORT_UNSPECIFIED},
+       // Uppercase spellings of the schemes above.
+       {"HTTP", PORT_UNSPECIFIED}, {"HTTPS", PORT_UNSPECIFIED},
+       {"FTP", PORT_UNSPECIFIED}, {"WS", PORT_UNSPECIFIED},
+       {"WSS", PORT_UNSPECIFIED}, {"GOPHER", PORT_UNSPECIFIED},
+   };
+
+   for (const auto& entry : cases) {
+     // Tag any failure below with the scheme being checked.
+     SCOPED_TRACE(entry.scheme);
+     const auto scheme_len = strlen(entry.scheme);
+     EXPECT_EQ(entry.expected_port,
+               DefaultPortForScheme(entry.scheme, scheme_len));
+   }
+ }
+
+ // Exercises IDNToASCII() on ASCII, mixed and non-ASCII input. Results are read
+ // using the reported output length instead of relying on NUL termination.
+ TEST(URLCanonTest, IDNToASCII) {
+   RawCanonOutputW<1024> output;
+
+   // Basic ASCII test.
+   gurl_base::string16 str = gurl_base::UTF8ToUTF16("hello");
+   EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
+   EXPECT_EQ(gurl_base::UTF8ToUTF16("hello"), gurl_base::string16(output.data(), output.length()));
+   output.set_length(0);
+
+   // Mixed ASCII/non-ASCII.
+   str = gurl_base::UTF8ToUTF16("hellö");
+   EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
+   EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--hell-8qa"), gurl_base::string16(output.data(), output.length()));
+   output.set_length(0);
+
+   // All non-ASCII.
+   str = gurl_base::UTF8ToUTF16("你好");
+   EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
+   EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--6qq79v"), gurl_base::string16(output.data(), output.length()));
+   output.set_length(0);
+
+   // Characters that need mapping (the Punycode result encodes "1⁄4").
+   str = gurl_base::UTF8ToUTF16("¼");
+   EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
+   EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--14-c6t"), gurl_base::string16(output.data(), output.length()));
+   output.set_length(0);
+
+   // Already starts with "xn--" and is all ASCII: must not be modified.
+   str = gurl_base::UTF8ToUTF16("xn--hell-8qa");
+   EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
+   EXPECT_EQ(gurl_base::UTF8ToUTF16("xn--hell-8qa"), gurl_base::string16(output.data(), output.length()));
+   output.set_length(0);
+
+   // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
+   // Should fail, due to a special case: if the label starts with "xn--", it
+   // should be parsed as Punycode, which must be all ASCII.
+   str = gurl_base::UTF8ToUTF16("xn--hellö");
+   EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output));
+   output.set_length(0);
+
+   // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
+   // This tests that there is still an error for the character '⁄' (U+2044),
+   // which would be a valid ASCII character, U+0044, if the high byte were
+   // ignored.
+   str = gurl_base::UTF8ToUTF16("xn--1⁄4");
+   EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output));
+   output.set_length(0);
+ }
+
+} // namespace url
diff --git a/url/url_constants.cc b/url/url_constants.cc
new file mode 100644
index 0000000..3540240
--- /dev/null
+++ b/url/url_constants.cc
@@ -0,0 +1,36 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_constants.h"
+
+ namespace url {  // Definitions for the constants declared in url_constants.h.
+
+ const char kAboutBlankURL[] = "about:blank";
+ const char kAboutSrcdocURL[] = "about:srcdoc";
+
+ const char kAboutBlankPath[] = "blank";  // kAboutBlankURL after "about:".
+ const char kAboutSrcdocPath[] = "srcdoc";  // kAboutSrcdocURL after "about:".
+
+ const char kAboutScheme[] = "about";
+ const char kBlobScheme[] = "blob";
+ const char kContentScheme[] = "content";
+ const char kContentIDScheme[] = "cid";
+ const char kDataScheme[] = "data";
+ const char kFileScheme[] = "file";
+ const char kFileSystemScheme[] = "filesystem";
+ const char kFtpScheme[] = "ftp";
+ const char kGopherScheme[] = "gopher";
+ const char kHttpScheme[] = "http";
+ const char kHttpsScheme[] = "https";
+ const char kJavaScriptScheme[] = "javascript";
+ const char kMailToScheme[] = "mailto";
+ const char kTelScheme[] = "tel";
+ const char kWsScheme[] = "ws";
+ const char kWssScheme[] = "wss";
+
+ const char kStandardSchemeSeparator[] = "://";
+
+ const size_t kMaxURLChars = 2 * 1024 * 1024;  // 2,097,152.
+
+ } // namespace url
diff --git a/url/url_constants.h b/url/url_constants.h
new file mode 100644
index 0000000..c077b8d
--- /dev/null
+++ b/url/url_constants.h
@@ -0,0 +1,45 @@
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_CONSTANTS_H_
+#define URL_URL_CONSTANTS_H_
+
+#include <stddef.h>
+
+#include "polyfills/base/component_export.h"
+
+ namespace url {
+
+ COMPONENT_EXPORT(URL) extern const char kAboutBlankURL[];  // "about:blank"
+ COMPONENT_EXPORT(URL) extern const char kAboutSrcdocURL[];  // "about:srcdoc"
+
+ COMPONENT_EXPORT(URL) extern const char kAboutBlankPath[];  // "blank"
+ COMPONENT_EXPORT(URL) extern const char kAboutSrcdocPath[];  // "srcdoc"
+
+ COMPONENT_EXPORT(URL) extern const char kAboutScheme[];
+ COMPONENT_EXPORT(URL) extern const char kBlobScheme[];
+ // The content scheme is specific to Android for identifying a stored file.
+ COMPONENT_EXPORT(URL) extern const char kContentScheme[];
+ COMPONENT_EXPORT(URL) extern const char kContentIDScheme[];  // "cid"
+ COMPONENT_EXPORT(URL) extern const char kDataScheme[];
+ COMPONENT_EXPORT(URL) extern const char kFileScheme[];
+ COMPONENT_EXPORT(URL) extern const char kFileSystemScheme[];
+ COMPONENT_EXPORT(URL) extern const char kFtpScheme[];
+ COMPONENT_EXPORT(URL) extern const char kGopherScheme[];
+ COMPONENT_EXPORT(URL) extern const char kHttpScheme[];
+ COMPONENT_EXPORT(URL) extern const char kHttpsScheme[];
+ COMPONENT_EXPORT(URL) extern const char kJavaScriptScheme[];
+ COMPONENT_EXPORT(URL) extern const char kMailToScheme[];
+ COMPONENT_EXPORT(URL) extern const char kTelScheme[];
+ COMPONENT_EXPORT(URL) extern const char kWsScheme[];
+ COMPONENT_EXPORT(URL) extern const char kWssScheme[];
+
+ // Used to separate a standard scheme and the hostname: "://".
+ COMPONENT_EXPORT(URL) extern const char kStandardSchemeSeparator[];
+
+ COMPONENT_EXPORT(URL) extern const size_t kMaxURLChars;  // 2 * 1024 * 1024
+
+ } // namespace url
+
+#endif // URL_URL_CONSTANTS_H_
diff --git a/url/url_file.h b/url/url_file.h
new file mode 100644
index 0000000..cfe047e
--- /dev/null
+++ b/url/url_file.h
@@ -0,0 +1,81 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_FILE_H_
+#define URL_URL_FILE_H_
+
+// Provides shared functions used by the internals of the parser and
+// canonicalizer for file URLs. Do not use outside of these modules.
+
+#include "base/strings/string_util.h"
+#include "url/url_parse_internal.h"
+
+namespace url {
+
+#ifdef WIN32
+
+ // Drive identifiers may be written either as "c:" or as "c|".
+ inline bool IsWindowsDriveSeparator(gurl_base::char16 ch) {
+   return ch == '|' || ch == ':';
+ }
+
+#endif // WIN32
+
+ // Returns the index of the first slash found at or after |begin_index|, or
+ // spec_len when the end of the input is reached without finding one.
+ template<typename CHAR>
+ inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) {
+   // Step forward until a slash is hit or the input is exhausted.
+   int pos = begin_index;
+   for (; pos < spec_len && !IsURLSlash(spec[pos]); ++pos) {}
+   return pos;
+ }
+
+#ifdef WIN32
+
+ // Reports whether the text at |start_offset| in |spec| looks like the start
+ // of a drive spec such as "c:". Offsets equal to or larger than |spec_len|
+ // are handled explicitly (the answer is false) to simplify callers.
+ //
+ // A true result guarantees that a valid drive letter followed by a drive
+ // separator (':' or '|') begins at |start_offset|.
+ template<typename CHAR>
+ inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                       int spec_len) {
+   const int chars_left = spec_len - start_offset;
+   // Room is needed for two characters: the drive letter and its separator.
+   // This test also rejects offsets at or beyond the end of the spec.
+   const bool has_room = chars_left >= 2;
+   // Short-circuit evaluation keeps the character reads below from happening
+   // when there is not enough room.
+   return has_room && gurl_base::IsAsciiAlpha(spec[start_offset]) &&
+          IsWindowsDriveSeparator(spec[start_offset + 1]);
+ }
+
+ // Reports whether the text at |start_offset| looks like the start of a UNC
+ // path, for example "\\". Offsets equal to or larger than |len| are handled
+ // explicitly (the answer is false) to simplify callers.
+ //
+ // With |strict_slashes| set only backslashes are accepted, as is standard for
+ // Windows. Otherwise forward slashes, common in URL handling, match as well.
+ template<typename CHAR>
+ inline bool DoesBeginUNCPath(const CHAR* text,
+                              int start_offset,
+                              int len,
+                              bool strict_slashes) {
+   // Two characters are required; this also rejects out-of-range offsets.
+   if (len - start_offset < 2)
+     return false;
+
+   const CHAR first = text[start_offset];
+   const CHAR second = text[start_offset + 1];
+   return strict_slashes ? (first == '\\' && second == '\\')
+                         : (IsURLSlash(first) && IsURLSlash(second));
+ }
+
+#endif // WIN32
+
+} // namespace url
+
+#endif // URL_URL_FILE_H_
diff --git a/url/url_idna_icu.cc b/url/url_idna_icu.cc
new file mode 100644
index 0000000..b0f91a1
--- /dev/null
+++ b/url/url_idna_icu.cc
@@ -0,0 +1,108 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// ICU-based IDNA converter.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "polyfills/base/logging.h"
+#include "base/no_destructor.h"
+#include <unicode/uidna.h>
+#include <unicode/utypes.h>
+#include "url/url_canon_icu.h"
+#include "url/url_canon_internal.h" // for _itoa_s
+
+namespace url {
+
+namespace {
+
+ // Holds ICU's UIDNA handle (a C pointer to a UTS46/IDNA 2008 handling
+ // object opened with uidna_openUTS46()) so that GetUIDNA() can keep it in a
+ // gurl_base::NoDestructor<>.
+ //
+ // UTS46 with BiDiCheck is used to migrate from IDNA 2003 (with unassigned
+ // code points allowed) to IDNA 2008 with the backward compatibility in
+ // mind. What it does:
+ //
+ // 1. Use the up-to-date Unicode data.
+ // 2. Define a case folding/mapping with the up-to-date Unicode data as
+ //    in IDNA 2003.
+ // 3. Use transitional mechanism for 4 deviation characters (sharp-s,
+ //    final sigma, ZWJ and ZWNJ) for now.
+ // 4. Continue to allow symbols and punctuations.
+ // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.
+ // 6. Do not apply STD3 rules
+ // 7. Do not allow unassigned code points.
+ //
+ // It also closely matches what IE 10 does except for the BiDi check (
+ // http://goo.gl/3XBhqw ).
+ // See http://unicode.org/reports/tr46/ and references therein for details.
+ struct UIDNAWrapper {
+   UIDNAWrapper() {
+     UErrorCode err = U_ZERO_ERROR;
+     // TODO(jungshik): Change options as different parties (browsers,
+     // registrars, search engines) converge toward a consensus.
+     value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
+     if (U_SUCCESS(err))
+       return;
+     GURL_CHECK(false) << "failed to open UTS46 data with error: "
+                       << u_errorName(err)
+                       << ". If you see this error message in a test environment "
+                       << "your test environment likely lacks the required data "
+                       << "tables for libicu. See https://crbug.com/778929.";
+     value = NULL;
+   }
+
+   UIDNA* value;
+ };
+
+} // namespace
+
+ UIDNA* GetUIDNA() {  // Hands out the UIDNA handle held by a UIDNAWrapper.
+   static gurl_base::NoDestructor<UIDNAWrapper> holder;
+   return holder->value;
+ }
+
+ // Converts the Unicode input representing a hostname to ASCII using IDN rules.
+ // The output must be ASCII, but is represented as wide characters.
+ //
+ // On success, returns true and the output holds the ASCII host name starting
+ // at offset 0, with its length set to the length of the new host name. Unlike
+ // most other canonicalization functions, this assumes the output is empty.
+ //
+ // On error, this will return false. The output in this case is undefined.
+ // TODO(jungshik): use UTF-8/ASCII version of nameToASCII.
+ // Change the function signature and callers accordingly to avoid unnecessary
+ // conversions in our code. In addition, consider using icu::IDNA's UTF-8/ASCII
+ // version with StringByteSink. That way, we can avoid C wrappers and additional
+ // string conversion.
+ bool IDNToASCII(const gurl_base::char16* src, int src_len, CanonOutputW* output) {
+   GURL_DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+
+   UIDNA* uidna = GetUIDNA();
+   GURL_DCHECK(uidna != NULL);
+   while (true) {
+     UErrorCode err = U_ZERO_ERROR;
+     UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+     // Named casts bridge char16 and ICU's UChar without dropping const on src.
+     int output_length = uidna_nameToASCII(
+         uidna, reinterpret_cast<const UChar*>(src), src_len,
+         reinterpret_cast<UChar*>(output->data()), output->capacity(), &info, &err);
+     if (U_SUCCESS(err) && info.errors == 0) {
+       output->set_length(output_length);
+       return true;
+     }
+
+     // TODO(jungshik): Look at info.errors to handle them case-by-case basis
+     // if necessary.
+     if (err != U_BUFFER_OVERFLOW_ERROR || info.errors != 0)
+       return false;  // Unknown error, give up.
+     // Not enough room in our buffer: grow it to the reported size and retry.
+     output->Resize(output_length);
+   }
+ }
+
+} // namespace url
diff --git a/url/url_parse_file.cc b/url/url_parse_file.cc
new file mode 100644
index 0000000..b666d0b
--- /dev/null
+++ b/url/url_parse_file.cc
@@ -0,0 +1,222 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "polyfills/base/logging.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_file.h"
+#include "url/url_parse_internal.h"
+
+// Interesting IE file:isms...
+//
+// INPUT OUTPUT
+// ========================= ==============================
+// file:/foo/bar file:///foo/bar
+// The result here seems totally invalid!?!? This isn't UNC.
+//
+// file:/
+// file:// or any other number of slashes
+// IE6 doesn't do anything at all if you click on this link. No error:
+// nothing. IE6's history system seems to always color this link, so I'm
+// guessing that it maps internally to the empty URL.
+//
+// C:\ file:///C:/
+// When on a file: URL source page, this link will work. When over HTTP,
+// the file: URL will appear in the status bar but the link will not work
+// (security restriction for all file URLs).
+//
+// file:foo/ file:foo/ (invalid?!?!?)
+// file:/foo/ file:///foo/ (invalid?!?!?)
+// file://foo/ file://foo/ (UNC to server "foo")
+// file:///foo/ file:///foo/ (invalid, seems to be a file)
+// file:////foo/ file://foo/ (UNC to server "foo")
+// Any more than four slashes is also treated as UNC.
+//
+// file:C:/ file://C:/
+// file:/C:/ file://C:/
+ // The number of slashes after "file:" doesn't matter if the thing following
+// it looks like an absolute drive path. Also, slashes and backslashes are
+// equally valid here.
+
+namespace url {
+
+namespace {
+
+ // A subcomponent of DoParseFileURL, the input of this function should be a UNC
+ // path name, with the index of the first character after the slashes following
+ // the scheme given in |after_slashes|. This will initialize the host, path,
+ // query, and ref, and leave the other output components untouched
+ // (DoParseFileURL handles these for us).
+ template<typename CHAR>
+ void DoParseUNC(const CHAR* spec,
+ int after_slashes,
+ int spec_len,
+ Parsed* parsed) {
+ int next_slash = FindNextSlash(spec, after_slashes, spec_len);
+ if (next_slash == spec_len) {
+ // No additional slash found, as in "file://foo", treat the text as the
+ // host with no path (this will end up being UNC to server "foo").
+ int host_len = spec_len - after_slashes;
+ if (host_len)
+ parsed->host = Component(after_slashes, host_len);
+ else
+ parsed->host.reset();
+ parsed->path.reset();
+ return;
+ }
+
+#ifdef WIN32
+ // See if we have something that looks like a path following the first
+ // component. As in "file://localhost/c:/", we get "c:/" out. We want to
+ // treat this as a having no host but the path given. Works on Windows only.
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
+ parsed->host.reset();
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+ &parsed->path, &parsed->query, &parsed->ref);
+ return;
+ }
+#endif
+
+ // Otherwise, everything up until that first slash we found is the host name,
+ // which will end up being the UNC host. For example "file://foo/bar.txt"
+ // will get a server name of "foo" and a path of "/bar.txt". Later, on
+ // Windows, this should be treated as the filename "\\foo\bar.txt" in proper
+ // UNC notation.
+ int host_len = next_slash - after_slashes;
+ if (host_len)
+ parsed->host = MakeRange(after_slashes, next_slash);
+ else
+ parsed->host.reset();
+ if (next_slash < spec_len) {
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+ &parsed->path, &parsed->query, &parsed->ref);
+ } else {
+ parsed->path.reset();
+ }
+ }
+
+ // A subcomponent of DoParseFileURL for local files: |path_begin| is the index
+ // where the path starts. Clears the host and fills in the path, query, and
+ // ref; the other output components are left untouched for the caller.
+ template<typename CHAR>
+ void DoParseLocalFile(const CHAR* spec,
+                       int path_begin,
+                       int spec_len,
+                       Parsed* parsed) {
+   const Component path_range = MakeRange(path_begin, spec_len);
+   parsed->host.reset();
+   ParsePathInternal(spec, path_range,
+                     &parsed->path, &parsed->query, &parsed->ref);
+ }
+
+ // Backend for the external functions that operates on either char type.
+ // Handles cases where there is a scheme, but also when handed the first
+ // character following the "file:" at the beginning of the spec. If so,
+ // this is usually a slash, but needn't be; we allow paths like "file:c:\foo".
+ template<typename CHAR>
+ void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+ GURL_DCHECK(spec_len >= 0);
+
+ // Get the parts we never use for file URLs out of the way.
+ parsed->username.reset();
+ parsed->password.reset();
+ parsed->port.reset();
+
+ // Many of the code paths don't set these, so it's convenient to just clear
+ // them. We'll write them in those cases we need them.
+ parsed->query.reset();
+ parsed->ref.reset();
+
+ // Strip leading & trailing spaces and control characters.
+ int begin = 0;
+ TrimURL(spec, &begin, &spec_len);
+
+ // Find the scheme, if any.
+ int num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
+ int after_scheme;
+ int after_slashes;
+#ifdef WIN32
+ // See how many slashes there are. We want to handle cases like UNC but also
+ // "/c:/foo". This is when there is no scheme, so we can allow pages to do
+ // links like "c:/foo/bar" or "//foo/bar". This is also called by the
+ // relative URL resolver when it determines there is an absolute URL, which
+ // may give us input like "/c:/foo".
+ after_slashes = begin + num_slashes;
+ if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
+ // Windows path, don't try to extract the scheme (for example, "c:\foo").
+ parsed->scheme.reset();
+ after_scheme = after_slashes;
+ } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
+ // Windows UNC path: don't try to extract the scheme, but keep the slashes.
+ parsed->scheme.reset();
+ after_scheme = begin;
+ } else
+#endif
+ {
+ // ExtractScheme doesn't understand the possibility of filenames with
+ // colons in them, in which case it returns the entire spec up to the
+ // colon as the scheme. So handle /foo.c:5 as a file but foo.c:5 as
+ // the foo.c: scheme.
+ if (!num_slashes &&
+ ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+ // Offset the results since we gave ExtractScheme a substring.
+ parsed->scheme.begin += begin;
+ after_scheme = parsed->scheme.end() + 1;
+ } else {
+ // No scheme found, remember that.
+ parsed->scheme.reset();
+ after_scheme = begin;
+ }
+ }
+
+ // Handle empty specs, ones that contain only whitespace or control chars,
+ // or that are just the scheme (for example "file:").
+ if (after_scheme == spec_len) {
+ parsed->host.reset();
+ parsed->path.reset();
+ return;
+ }
+
+ num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+ after_slashes = after_scheme + num_slashes;
+#ifdef WIN32
+ // Check whether the input is a drive again. We checked above for windows
+ // drive specs, but that's only at the very beginning to see if we have a
+ // scheme at all. This test will be duplicated in that case, but will
+ // additionally handle all cases with a real scheme such as "file:///C:/".
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
+ num_slashes != 3) {
+ // Anything not beginning with a drive spec ("c:\") on Windows is treated
+ // as UNC, with the exception of three slashes which always means a file.
+ // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
+ return;
+ }
+#else
+ // file: URL with exactly 2 slashes is considered to have a host component.
+ if (num_slashes == 2) {
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
+ return;
+ }
+#endif // WIN32
+
+ // Easy and common case, the full path immediately follows the scheme
+ // (modulo slashes), as in "file://c:/foo". Just treat everything from
+ // there to the end as the path. Empty hosts have 0 length instead of -1.
+ // We include the last slash as part of the path if there is one.
+ DoParseLocalFile(spec,
+ num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
+ spec_len, parsed);
+ }
+
+} // namespace
+
+ void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
+   DoParseFileURL<char>(url, url_len, parsed);  // Shared backend, char input.
+ }
+
+ void ParseFileURL(const gurl_base::char16* url, int url_len, Parsed* parsed) {
+   DoParseFileURL<gurl_base::char16>(url, url_len, parsed);  // char16 input.
+ }
+
+} // namespace url
diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h
new file mode 100644
index 0000000..6f86d86
--- /dev/null
+++ b/url/url_parse_internal.h
@@ -0,0 +1,91 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_PARSE_INTERNAL_H_
+#define URL_URL_PARSE_INTERNAL_H_
+
+// Contains common inline helper functions used by the URL parsing routines.
+
+#include "url/third_party/mozilla/url_parse.h"
+
+namespace url {
+
+// Backslashes count as slashes too; both are accepted for IE compatibility.
+inline bool IsURLSlash(gurl_base::char16 ch) {
+  return ch == '\\' || ch == '/';
+}
+
+// True for characters that get trimmed from a URL: the space character and
+// anything that compares below it (the control characters).
+inline bool ShouldTrimFromURL(gurl_base::char16 ch) {
+  return !(ch > ' ');
+}
+
+// Narrows the index range [|*begin|, |*len|) of |spec| in place so that it no
+// longer starts with characters ShouldTrimFromURL() accepts and, when
+// |trim_path_end| is true, no longer ends with them either. Despite its name,
+// |*len| is an end position within |spec|, not a count of characters measured
+// from |*begin|.
+template<typename CHAR>
+inline void TrimURL(const CHAR* spec, int* begin, int* len,
+                    bool trim_path_end = true) {
+  // Advance the start past leading whitespace and control characters.
+  for (; *begin < *len && ShouldTrimFromURL(spec[*begin]); ++*begin) {
+  }
+
+  if (!trim_path_end)
+    return;
+  // Pull the end back over trailing whitespace and control characters. The
+  // comparison against |*begin| stops the scan when the input is all blanks,
+  // so the end never backs up past the start.
+  for (; *len > *begin && ShouldTrimFromURL(spec[*len - 1]); --*len) {
+  }
+}
+
+// Returns how many characters accepted by IsURLSlash() appear back to back in
+// |str| starting at |begin_offset|, without reading at or past |str_len|.
+template<typename CHAR>
+inline int CountConsecutiveSlashes(const CHAR *str,
+                                   int begin_offset, int str_len) {
+  int pos = begin_offset;
+  for (; pos < str_len; ++pos) {
+    if (!IsURLSlash(str[pos])) break;
+  }
+  return pos - begin_offset;
+}
+
+// Internal functions in url_parse.cc that parse the path, that is, everything
+// following the authority section. The input |path| is the range of everything
+// following the authority section, and the identified ranges are written to
+// |filepath|, |query| and |ref|.
+//
+// Meant for the file URL parser and other consumers that do special handling
+// up front but want regular path parsing; it maps to the internal path parser.
+void ParsePathInternal(const char* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref);
+void ParsePathInternal(const gurl_base::char16* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref);
+
+
+// Given a spec and the index of the character after the colon following the
+// scheme, this parses it and fills in the structure. Every item in the parsed
+// structure is filled EXCEPT for the scheme, which is untouched.
+void ParseAfterScheme(const char* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed);
+void ParseAfterScheme(const gurl_base::char16* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed);
+
+} // namespace url
+
+#endif // URL_URL_PARSE_INTERNAL_H_
diff --git a/url/url_parse_perftest.cc b/url/url_parse_perftest.cc
new file mode 100644
index 0000000..82c7693
--- /dev/null
+++ b/url/url_parse_perftest.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/string_piece.h"
+#include "base/test/perf_time_logger.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+
+namespace {
+
+TEST(URLParse, FullURL) {  // Runs ParseStandardURL() a million times on one URL.
+ constexpr gurl_base::StringPiece kUrl =
+ "http://me:pass@host/foo/bar.html;param?query=yes#ref";
+
+ url::Parsed parsed;  // Reused as the output of every iteration.
+ gurl_base::PerfTimeLogger timer("Full_URL_Parse_AMillion");
+
+ for (int i = 0; i < 1000000; i++)
+ url::ParseStandardURL(kUrl.data(), kUrl.size(), &parsed);
+ timer.Done();  // Called once, after the whole loop has finished.
+}
+
+constexpr gurl_base::StringPiece kTypicalUrl1 =  // First of three inputs shared by the tests below.
+ "http://www.google.com/"
+ "search?q=url+parsing&ie=utf-8&oe=utf-8&aq=t&rls=org.mozilla:en-US:"
+ "official&client=firefox-a";
+
+constexpr gurl_base::StringPiece kTypicalUrl2 =  // Second shared input.
+ "http://www.amazon.com/Stephen-King-Thrillers-Horror-People/dp/0766012336/"
+ "ref=sr_1_2/133-4144931-4505264?ie=UTF8&s=books&qid=2144880915&sr=8-2";
+
+constexpr gurl_base::StringPiece kTypicalUrl3 =  // Third shared input.
+ "http://store.apple.com/1-800-MY-APPLE/WebObjects/AppleStore.woa/wa/"
+ "RSLID?nnmm=browse&mco=578E9744&node=home/desktop/mac_pro";
+
+TEST(URLParse, TypicalURLParse) {  // Parse-only pass over the three kTypicalUrl inputs.
+ url::Parsed parsed1;
+ url::Parsed parsed2;
+ url::Parsed parsed3;
+
+ // 333333 iterations x 3 URLs gives roughly one million parses in total.
+ gurl_base::PerfTimeLogger parse_timer("Typical_URL_Parse_AMillion");
+ for (int i = 0; i < 333333; i++) {
+ url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+ url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+ url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+ }
+ parse_timer.Done();
+}
+
+// Parsing plus canonicalization into one reused RawCanonOutput ("no mallocs").
+TEST(URLParse, TypicalURLParseCanon) {
+ url::Parsed parsed1;
+ url::Parsed parsed2;
+ url::Parsed parsed3;
+
+ gurl_base::PerfTimeLogger canon_timer("Typical_Parse_Canon_AMillion");
+ url::Parsed out_parsed;
+ url::RawCanonOutput<1024> output;
+ for (int i = 0; i < 333333; i++) { // 3 URLs per pass, so about 1M in total
+ url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+ output.set_length(0);  // Start each canonicalization from an empty output.
+ url::CanonicalizeStandardURL(
+ kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+ &out_parsed);
+
+ url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+ output.set_length(0);
+ url::CanonicalizeStandardURL(
+ kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+ &out_parsed);
+
+ url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+ output.set_length(0);
+ url::CanonicalizeStandardURL(
+ kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output,
+ &out_parsed);
+ }
+ canon_timer.Done();
+}
+
+// Parsing plus canonicalization into a fresh std::string per URL (mallocs).
+TEST(URLParse, TypicalURLParseCanonStdString) {
+ url::Parsed parsed1;
+ url::Parsed parsed2;
+ url::Parsed parsed3;
+
+ gurl_base::PerfTimeLogger canon_timer("Typical_Parse_Canon_AMillion");  // NOTE(review): same label as TypicalURLParseCanon; confirm intended.
+ url::Parsed out_parsed;
+ for (int i = 0; i < 333333; i++) { // 3 URLs per pass, so about 1M in total
+ url::ParseStandardURL(kTypicalUrl1.data(), kTypicalUrl1.size(), &parsed1);
+ std::string out1;  // New string and output wrapper on every iteration.
+ url::StdStringCanonOutput output1(&out1);
+ url::CanonicalizeStandardURL(
+ kTypicalUrl1.data(), kTypicalUrl1.size(), parsed1,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output1,
+ &out_parsed);
+
+ url::ParseStandardURL(kTypicalUrl2.data(), kTypicalUrl2.size(), &parsed2);
+ std::string out2;
+ url::StdStringCanonOutput output2(&out2);
+ url::CanonicalizeStandardURL(
+ kTypicalUrl2.data(), kTypicalUrl2.size(), parsed2,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output2,
+ &out_parsed);
+
+ url::ParseStandardURL(kTypicalUrl3.data(), kTypicalUrl3.size(), &parsed3);
+ std::string out3;
+ url::StdStringCanonOutput output3(&out3);
+ url::CanonicalizeStandardURL(
+ kTypicalUrl3.data(), kTypicalUrl3.size(), parsed3,
+ url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &output3,
+ &out_parsed);
+ }
+ canon_timer.Done();
+}
+
+TEST(URLParse, GURL) {  // Constructs a GURL from each typical URL on every iteration.
+ gurl_base::PerfTimeLogger gurl_timer("Typical_GURL_AMillion");
+ for (int i = 0; i < 333333; i++) { // 3 URLs per pass, so about 1M in total
+ GURL gurl1(kTypicalUrl1);
+ GURL gurl2(kTypicalUrl2);
+ GURL gurl3(kTypicalUrl3);
+ }
+ gurl_timer.Done();
+}
+
+} // namespace
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
new file mode 100644
index 0000000..a1c38c2
--- /dev/null
+++ b/url/url_parse_unittest.cc
@@ -0,0 +1,690 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/third_party/mozilla/url_parse.h"
+
+#include <stddef.h>
+
+#include "base/stl_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/third_party/mozilla/url_parse.h"
+
+// Interesting IE file:isms...
+//
+// file:/foo/bar file:///foo/bar
+// The result here seems totally invalid!?!? This isn't UNC.
+//
+// file:/
+// file:// or any other number of slashes
+// IE6 doesn't do anything at all if you click on this link. No error:
+// nothing. IE6's history system seems to always color this link, so I'm
+// guessing that it maps internally to the empty URL.
+//
+// C:\ file:///C:/
+// / file:///C:/
+// /foo file:///C:/foo
+// Interestingly, IE treats "/" as an alias for "c:\", which makes sense,
+// but is weird to think about on Windows.
+//
+// file:foo/ file:foo/ (invalid?!?!?)
+// file:/foo/ file:///foo/ (invalid?!?!?)
+// file://foo/ file://foo/ (UNC to server "foo")
+// file:///foo/ file:///foo/ (invalid)
+// file:////foo/ file://foo/ (UNC to server "foo")
+// Any more than four slashes is also treated as UNC.
+//
+// file:C:/ file://C:/
+// file:/C:/ file://C:/
+// The number of slashes after "file:" doesn't matter if the thing following
+// it looks like an absolute drive path. Also, slashes and backslashes are
+// equally valid here.
+
+namespace url {
+namespace {
+
+// One parse case for the standard and file URL tables: input plus expectations.
+struct URLParseCase {
+ const char* input;  // URL text handed to the parser.
+
+ const char* scheme;  // Expected text of each component; NULL means nonexistent.
+ const char* username;
+ const char* password;
+ const char* host;
+ int port;  // Compared against the value ParsePort() returns.
+ const char* path;
+ const char* query;
+ const char* ref;
+};
+
+// Simpler version of URLParseCase for testing path URLs.
+struct PathURLParseCase {
+ const char* input;  // URL text handed to ParsePathURL().
+
+ const char* scheme;  // Expected scheme text; NULL means nonexistent.
+ const char* path;  // Expected text of parsed.GetContent() in the PathURL test.
+};
+
+// Simpler version of URLParseCase for testing mailto URLs.
+struct MailtoURLParseCase {
+ const char* input;  // URL text to parse.
+
+ const char* scheme;  // Expected values, in the column order of mailto_cases.
+ const char* path;
+ const char* query;
+};
+
+// More complicated version of URLParseCase for testing filesystem URLs.
+struct FileSystemURLParseCase {
+ const char* input;  // URL text to parse.
+
+ const char* inner_scheme;  // inner_*: presumably the nested URL's parts; confirm in the test.
+ const char* inner_username;
+ const char* inner_password;
+ const char* inner_host;
+ int inner_port;
+ const char* inner_path;
+ const char* path;  // Remaining expectations for the outer URL (assumed; verify).
+ const char* query;
+ const char* ref;
+};
+
+bool ComponentMatches(const char* input,
+                      const char* reference,
+                      const Component& component) {
+  // A length is either a real count (>= 0) or the -1 "nonexistent" marker.
+  EXPECT_TRUE(component.len >= 0 || component.len == -1);
+
+  // The start offset must never be negative.
+  EXPECT_LE(0, component.begin);
+
+  const bool missing = component.len < 0;
+  // A NULL |reference| expects a nonexistent component, i.e. a length of -1.
+  if (reference == NULL)
+    return component.len == -1;
+  if (missing)
+    return false;  // Text was expected, but the component is absent.
+
+  // Lengths must agree before the characters of the component are compared.
+  const size_t length = static_cast<size_t>(component.len);
+  return strlen(reference) == length &&
+         strncmp(reference, input + component.begin, length) == 0;
+}
+
+void ExpectInvalidComponent(const Component& component) {
+ EXPECT_EQ(0, component.begin);  // An unused component is expected to begin at 0...
+ EXPECT_EQ(-1, component.len);  // ...and to carry the -1 "nonexistent" length.
+}
+
+// Parsed ----------------------------------------------------------------------
+
+TEST(URLParser, Length) {
+  const char* length_cases[] = {
+      // Every component present.
+      "http://user:pass@host:99/foo?bar#baz",
+      // Nothing at all.
+      "",
+      // The full URL again, with one more piece removed from the end each time.
+      "http://user:pass@host:99/foo?bar#",
+      "http://user:pass@host:99/foo?bar",
+      "http://user:pass@host:99/foo?",
+      "http://user:pass@host:99/foo",
+      "http://user:pass@host:99/",
+      "http://user:pass@host:99",
+      "http://user:pass@host:",
+      "http://user:pass@host",
+      "http://host",
+      "http://user@",
+      "http:",
+  };
+  for (const char* spec : length_cases) {
+    const int true_length = static_cast<int>(strlen(spec));
+
+    Parsed parsed;
+    ParseStandardURL(spec, true_length, &parsed);
+    // Parsed::Length() must report exactly the number of characters given.
+    EXPECT_EQ(true_length, parsed.Length());
+  }
+}
+
+TEST(URLParser, CountCharactersBefore) {
+  struct CountCase {
+    const char* url;
+    Parsed::ComponentType component;
+    bool include_delimiter;
+    int expected_count;
+  } count_cases[] = {
+      // All components present; the ruler gives each character's offset.
+      //0         1         2
+      //0123456789012345678901
+      {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0},
+      {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0},
+      {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7},
+      {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7},
+      {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9},
+      {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9},
+      {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11},
+      {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11},
+      {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12},
+      {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13},
+      {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14},
+      {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14},
+      {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16},
+      {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17},
+      {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18},
+      {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19},
+      // The requested component is absent from the URL.
+      {"http://u:p@h:8/p?", Parsed::REF, true, 17},
+      {"http://u:p@h:8/p?q", Parsed::REF, true, 18},
+      {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16},
+      {"http://u:p@h:8#r", Parsed::PATH, true, 14},
+      {"http://u:p@h/", Parsed::PORT, true, 12},
+      {"http://u:p@/", Parsed::HOST, true, 11},
+      // Slightly odd: with no password, the reported position is where the
+      // host begins. That is defensible, though one could equally argue for
+      // the '@' sign. Reporting the '@' position is harder to do, so the
+      // simpler answer is what is expected here.
+      {"http://u@h/", Parsed::PASSWORD, true, 9},
+      {"http://h/", Parsed::USERNAME, true, 7},
+      {"http:", Parsed::USERNAME, true, 5},
+      {"", Parsed::SCHEME, true, 0},
+      // An arbitrary component must still work on a completely empty URL.
+      {"", Parsed::REF, true, 0},
+      // File URLs have no host, which makes them a special case worth covering.
+      {"file:///c:/foo", Parsed::USERNAME, true, 7},
+      {"file:///c:/foo", Parsed::PASSWORD, true, 7},
+      {"file:///c:/foo", Parsed::HOST, true, 7},
+      {"file:///c:/foo", Parsed::PATH, true, 7},
+  };
+  for (const CountCase& test : count_cases) {
+    const int length = static_cast<int>(strlen(test.url));
+
+    // A leading 'f' is enough here to tell file URLs from standard ones.
+    Parsed parsed;
+    if (length > 0 && test.url[0] == 'f')
+      ParseFileURL(test.url, length, &parsed);
+    else
+      ParseStandardURL(test.url, length, &parsed);
+
+    EXPECT_EQ(test.expected_count,
+              parsed.CountCharactersBefore(test.component,
+                                           test.include_delimiter));
+  }
+}
+
+// Standard --------------------------------------------------------------------
+
+// Input Scheme Usrname Passwd Host Port Path Query Ref
+// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ -----
+static URLParseCase cases[] = {
+ // Regular URL with all the parts
+{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"},
+
+ // Known schemes should lean towards authority identification
+{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+
+ // Spaces!
+{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL},
+{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"},
+
+ // Invalid port numbers should be identified and turned into -2, empty port
+ // numbers should be -1. Spaces aren't allowed in port numbers
+{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL},
+{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL},
+{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL},
+{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL},
+{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"},
+
+ // Creative URLs missing key elements
+{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL},
+{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL},
+{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL},
+{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""},
+{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""},
+{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"},
+{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"},
+{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"},
+{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL},
+{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL},
+{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL},
+{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL},
+{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+
+ // Username/passwords and things that look like them
+{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL},
+{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL},
+ // ... "]" in the password field isn't allowed, but we tolerate it here...
+{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL},
+{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL},
+{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL},
+
+{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL},
+{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL},
+{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL},
+
+ // Tolerate different numbers of slashes.
+{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL},
+{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL},
+{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL},
+
+ // Raw file paths on Windows aren't handled by the parser.
+{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL},
+{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL},
+
+ // Use the first question mark for the query and the ref.
+{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"},
+{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL},
+{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"},
+
+ // IPv6, check also interesting uses of colons.
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL},
+{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL},
+{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL},
+{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL},
+
+ // Partially-complete IPv6 literals, and related cases.
+{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL},
+{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL},
+{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL},
+{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL},
+{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL},
+{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL},
+{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL},
+
+};
+
+TEST(URLParser, Standard) {
+  // |parsed| deliberately lives outside the loop: reusing one instance helps
+  // expose any field the parser forgets to reset between inputs.
+  Parsed parsed;
+  for (const URLParseCase& test : cases) {
+    const char* url = test.input;
+    ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+    const int port = ParsePort(url, parsed.port);
+
+    EXPECT_TRUE(ComponentMatches(url, test.scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, test.username, parsed.username));
+    EXPECT_TRUE(ComponentMatches(url, test.password, parsed.password));
+    EXPECT_TRUE(ComponentMatches(url, test.host, parsed.host));
+    EXPECT_EQ(test.port, port);
+    EXPECT_TRUE(ComponentMatches(url, test.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, test.query, parsed.query));
+    EXPECT_TRUE(ComponentMatches(url, test.ref, parsed.ref));
+  }
+}
+
+// PathURL --------------------------------------------------------------------
+
+// Various incarnations of path URLs.
+static PathURLParseCase path_cases[] = {
+{"", NULL, NULL},
+{":", "", NULL},
+{":/", "", "/"},
+{"/", NULL, "/"},
+{" This is \\interesting// \t", NULL, "This is \\interesting// \t"},
+{"about:", "about", NULL},
+{"about:blank", "about", "blank"},
+{" about: blank ", "about", " blank "},
+{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\"); "},
+};
+
+TEST(URLParser, PathURL) {
+  // |parsed| deliberately lives outside the loop: reusing one instance helps
+  // expose any field the parser forgets to reset between inputs.
+  Parsed parsed;
+  for (size_t i = 0; i < gurl_base::size(path_cases); i++) {
+    const PathURLParseCase& test = path_cases[i];
+    ParsePathURL(test.input, static_cast<int>(strlen(test.input)), false,
+                 &parsed);
+
+    EXPECT_TRUE(ComponentMatches(test.input, test.scheme, parsed.scheme)) << i;
+    EXPECT_TRUE(ComponentMatches(test.input, test.path, parsed.GetContent()))
+        << i;
+
+    // Path URLs never populate any of the authority components.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.host);
+    ExpectInvalidComponent(parsed.port);
+  }
+}
+
+// Various incarnations of file URLs.
+static URLParseCase file_cases[] = {
+#ifdef WIN32
+{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL},
+{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL},
+{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL},
+{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL},
+{"file://localhost/c:/", "file", NULL, NULL, NULL, -1, "/c:/", NULL, NULL},
+{"file://127.0.0.1/c|\\", "file", NULL, NULL, NULL, -1, "/c|\\", NULL, NULL},
+{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+ // If there is a Windows drive letter, treat any number of slashes as the
+ // path part.
+{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL},
+{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL},
+{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL},
+{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL},
+{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL},
+ // If there is not a drive letter, we should treat it as UNC EXCEPT for
+ // three slashes, which we treat as a Unix style path.
+{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL},
+{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL},
+{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+ // Queries and refs are valid for file URLs as well.
+{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""},
+{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"},
+#else // WIN32
+ // No slashes.
+ {"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+ {"file:path", "file", NULL, NULL, NULL, -1, "path", NULL, NULL},
+ {"file:path/", "file", NULL, NULL, NULL, -1, "path/", NULL, NULL},
+ {"file:path/f.txt", "file", NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL},
+ // One slash.
+ {"file:/", "file", NULL, NULL, NULL, -1, "/", NULL, NULL},
+ {"file:/path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL},
+ {"file:/path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL},
+ {"file:/path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ // Two slashes.
+ {"file://", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+ {"file://server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL},
+ {"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL},
+ {"file://server/f.txt", "file", NULL, NULL, "server", -1, "/f.txt", NULL, NULL},
+ // Three slashes.
+ {"file:///", "file", NULL, NULL, NULL, -1, "/", NULL, NULL},
+ {"file:///path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL},
+ {"file:///path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL},
+ {"file:///path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ // More than three slashes.
+ {"file:////", "file", NULL, NULL, NULL, -1, "/", NULL, NULL},
+ {"file:////path", "file", NULL, NULL, NULL, -1, "/path", NULL, NULL},
+ {"file:////path/", "file", NULL, NULL, NULL, -1, "/path/", NULL, NULL},
+ {"file:////path/f.txt", "file", NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ // Schemeless URLs
+ {"path/f.txt", NULL, NULL, NULL, NULL, -1, "path/f.txt", NULL, NULL},
+ {"path:80/f.txt", "path", NULL, NULL, NULL, -1, "80/f.txt", NULL, NULL},
+ {"path/f.txt:80", "path/f.txt",NULL, NULL, NULL, -1, "80", NULL, NULL}, // Wrong.
+ {"/path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ {"/path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL},
+ {"/path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL},
+ {"//server/f.txt", NULL, NULL, NULL, "server", -1, "/f.txt", NULL, NULL},
+ {"//server:80/f.txt", NULL, NULL, NULL, "server:80",-1, "/f.txt", NULL, NULL},
+ {"//server/f.txt:80", NULL, NULL, NULL, "server", -1, "/f.txt:80", NULL, NULL},
+ {"///path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ {"///path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL},
+ {"///path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL},
+ {"////path/f.txt", NULL, NULL, NULL, NULL, -1, "/path/f.txt", NULL, NULL},
+ {"////path:80/f.txt", NULL, NULL, NULL, NULL, -1, "/path:80/f.txt",NULL, NULL},
+ {"////path/f.txt:80", NULL, NULL, NULL, NULL, -1, "/path/f.txt:80",NULL, NULL},
+ // Queries and refs are valid for file URLs as well.
+ {"file:///foo.html?#", "file", NULL, NULL, NULL, -1, "/foo.html", "", ""},
+ {"file:///foo.html?q=y#ref", "file", NULL, NULL, NULL, -1, "/foo.html", "q=y", "ref"},
+#endif // WIN32
+};
+
+TEST(URLParser, ParseFileURL) {
+  // Declared outside for loop to try to catch cases in init() where we forget
+  // to reset something that is reset by the constructor.
+  Parsed parsed;
+  for (size_t i = 0; i < gurl_base::size(file_cases); i++) {
+    const char* url = file_cases[i].input;
+    ParseFileURL(url, static_cast<int>(strlen(url)), &parsed);
+    int port = ParsePort(url, parsed.port);
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.scheme.begin << ", " << parsed.scheme.len;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.username.begin << ", " << parsed.username.len;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.password.begin << ", " << parsed.password.len;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.host.begin << ", " << parsed.host.len;
+
+    EXPECT_EQ(file_cases[i].port, port)
+        << " for case #" << i << " [ " << url << "] " << port;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.path.begin << ", " << parsed.path.len;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query))
+        << " for case #" << i << " [" << url << "] "
+        << parsed.query.begin << ", " << parsed.query.len;
+
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref))
+        << " for case #" << i << " [ "<< url << "] "
+        << parsed.ref.begin << ", " << parsed.ref.len;  // Report the ref itself.
+  }
+}
+
+
+TEST(URLParser, ExtractFileName) {
+  struct FileCase {
+    const char* input;
+    const char* expected;
+  } extract_cases[] = {
+      {"http://www.google.com", NULL},
+      {"http://www.google.com/", ""},
+      {"http://www.google.com/search", "search"},
+      {"http://www.google.com/search/", ""},
+      {"http://www.google.com/foo/bar.html?baz=22", "bar.html"},
+      {"http://www.google.com/foo/bar.html#ref", "bar.html"},
+      {"http://www.google.com/search/;param", ""},
+      {"http://www.google.com/foo/bar.html;param#ref", "bar.html"},
+      {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html"},
+      {"http://www.google.com/foo/bar.html?query#ref", "bar.html"},
+      {"http://www.google.com/foo;/bar.html", "bar.html"},
+      {"http://www.google.com/foo;/", ""},
+      {"http://www.google.com/foo;", "foo"},
+      {"http://www.google.com/;", ""},
+      {"http://www.google.com/foo;bar;html", "foo"},
+  };
+
+  for (const FileCase& test : extract_cases) {
+    const char* url = test.input;
+    const int len = static_cast<int>(strlen(url));
+
+    Parsed parsed;
+    ParseStandardURL(url, len, &parsed);
+    // Extract the file name from the path component that was just parsed.
+    Component file_name;
+    ExtractFileName(url, parsed.path, &file_name);
+
+    EXPECT_TRUE(ComponentMatches(url, test.expected, file_name));
+  }
+}
+
+// Returns true if the |parameter|-th key/value pair (1-based) of |url|'s query
+// string has exactly the key |expected_key| and the value |expected_value|. A
+// NULL |expected_key| instead asserts that no such parameter exists.
+static bool NthParameterIs(const char* url,
+                           int parameter,
+                           const char* expected_key,
+                           const char* expected_value) {
+  Parsed parsed;
+  ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+
+  Component query = parsed.query;
+
+  for (int i = 1; i <= parameter; i++) {
+    Component key, value;
+    if (!ExtractQueryKeyValue(url, &query, &key, &value)) {
+      // Ran out of parameters: a match only if none was expected here.
+      return expected_key == NULL;
+    }
+
+    if (i != parameter)
+      continue;
+
+    if (!expected_key)
+      return false;  // Found a parameter where none was expected.
+
+    // ComponentMatches() compares lengths as well as characters. A strncmp()
+    // bounded by the component length would accept any expected string that
+    // merely starts with the component (an empty component matches anything).
+    return ComponentMatches(url, expected_key, key) &&
+           ComponentMatches(url, expected_value, value);
+  }
+  return expected_key == NULL;  // We didn't find that many parameters.
+}
+
+TEST(URLParser, ExtractQueryKeyValue) {
+  EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, nullptr, nullptr));
+
+  // Basic case: several parameters, the last one a bare key without '='.
+  const char basic[] = "http://www.google.com?arg1=1&arg2=2&bar";
+  EXPECT_TRUE(NthParameterIs(basic, 1, "arg1", "1"));
+  EXPECT_TRUE(NthParameterIs(basic, 2, "arg2", "2"));
+  EXPECT_TRUE(NthParameterIs(basic, 3, "bar", ""));
+  EXPECT_TRUE(NthParameterIs(basic, 4, nullptr, nullptr));
+
+  // Empty param at the end: the trailing '&' adds no parameter.
+  const char trailing_separator[] = "http://www.google.com?foo=bar&";
+  EXPECT_TRUE(NthParameterIs(trailing_separator, 1, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(trailing_separator, 2, nullptr, nullptr));
+
+  // Empty param at the beginning: the leading '&' yields an empty first pair.
+  const char leading_separator[] = "http://www.google.com?&foo=bar";
+  EXPECT_TRUE(NthParameterIs(leading_separator, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(leading_separator, 2, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(leading_separator, 3, nullptr, nullptr));
+
+  // Empty key with value.
+  const char empty_key[] = "http://www.google.com?=foo";
+  EXPECT_TRUE(NthParameterIs(empty_key, 1, "", "foo"));
+  EXPECT_TRUE(NthParameterIs(empty_key, 2, nullptr, nullptr));
+
+  // Empty value with key.
+  const char empty_value[] = "http://www.google.com?foo=";
+  EXPECT_TRUE(NthParameterIs(empty_value, 1, "foo", ""));
+  EXPECT_TRUE(NthParameterIs(empty_value, 2, nullptr, nullptr));
+
+  // Empty keys and values; a second '=' in a pair belongs to the value.
+  const char all_empty[] = "http://www.google.com?&&==&=";
+  EXPECT_TRUE(NthParameterIs(all_empty, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 2, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 3, "", "="));
+  EXPECT_TRUE(NthParameterIs(all_empty, 4, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 5, nullptr, nullptr));
+}
+
+// MailtoURL --------------------------------------------------------------------
+
+static MailtoURLParseCase mailto_cases[] = {
+// Columns: input, then the expected scheme, path and query components.
+{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL},
+{" mailto: to \t", "mailto", " to", NULL},
+{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL},
+{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL},
+{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL},
+{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"},
+{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"},
+{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"},
+{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",},
+{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"},
+};
+
+TEST(URLParser, MailtoUrl) {
+  // |parsed| is deliberately reused across iterations so that a parser which
+  // forgets to reset a field normally reset by the constructor gets caught.
+  Parsed parsed;
+  for (const MailtoURLParseCase& test_case : mailto_cases) {
+    const char* url = test_case.input;
+    ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed);
+    const int port = ParsePort(url, parsed.port);
+
+    EXPECT_TRUE(ComponentMatches(url, test_case.scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, test_case.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, test_case.query, parsed.query));
+    EXPECT_EQ(PORT_UNSPECIFIED, port);
+
+    // The remaining components are never used for mailto URLs.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.port);
+    ExpectInvalidComponent(parsed.ref);
+  }
+}
+
+// Various incarnations of filesystem URLs: the input, then expected values.
+static FileSystemURLParseCase filesystem_cases[] = {
+ // Regular URL with all the parts; the rows after it omit some of them.
+{"filesystem:http://user:pass@foo:21/temporary/bar;par?b#c", "http", "user", "pass", "foo", 21, "/temporary", "/bar;par", "b", "c"},
+{"filesystem:https://foo/persistent/bar;par/", "https", NULL, NULL, "foo", -1, "/persistent", "/bar;par/", NULL, NULL},
+{"filesystem:file:///persistent/bar;par/", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", NULL, NULL},
+{"filesystem:file:///persistent/bar;par/?query#ref", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", "query", "ref"},
+{"filesystem:file:///persistent", "file", NULL, NULL, NULL, -1, "/persistent", "", NULL, NULL},
+};
+
+TEST(URLParser, FileSystemURL) {
+  // |parsed| is deliberately reused across iterations so that a parser which
+  // forgets to reset a field normally reset by the constructor gets caught.
+  Parsed parsed;
+  for (const FileSystemURLParseCase& test_case : filesystem_cases) {
+    const char* url = test_case.input;
+    ParseFileSystemURL(url, static_cast<int>(strlen(url)), &parsed);
+
+    EXPECT_TRUE(ComponentMatches(url, "filesystem", parsed.scheme));
+
+    // An inner URL must be present exactly when an inner scheme is expected.
+    const Parsed* inner = parsed.inner_parsed();
+    EXPECT_EQ(!test_case.inner_scheme, !inner);
+    // Only check the inner components if there is an inner URL.
+    if (inner) {
+      EXPECT_TRUE(
+          ComponentMatches(url, test_case.inner_scheme, inner->scheme));
+      EXPECT_TRUE(
+          ComponentMatches(url, test_case.inner_username, inner->username));
+      EXPECT_TRUE(
+          ComponentMatches(url, test_case.inner_password, inner->password));
+      EXPECT_TRUE(ComponentMatches(url, test_case.inner_host, inner->host));
+      EXPECT_EQ(test_case.inner_port, ParsePort(url, inner->port));
+
+      // The remaining components are never used for filesystem URLs.
+      ExpectInvalidComponent(inner->query);
+      ExpectInvalidComponent(inner->ref);
+    }
+
+    EXPECT_TRUE(ComponentMatches(url, test_case.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, test_case.query, parsed.query));
+    EXPECT_TRUE(ComponentMatches(url, test_case.ref, parsed.ref));
+
+    // The remaining components are never used for filesystem URLs.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.host);
+    ExpectInvalidComponent(parsed.port);
+  }
+}
+
+} // namespace
+} // namespace url
diff --git a/url/url_test_utils.h b/url/url_test_utils.h
new file mode 100644
index 0000000..f8d40e1
--- /dev/null
+++ b/url/url_test_utils.h
@@ -0,0 +1,40 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_TEST_UTILS_H_
+#define URL_URL_TEST_UTILS_H_
+
+// Convenience functions for string conversions.
+// These are mostly intended for use in unit tests.
+
+#include <string>
+
+#include "base/strings/string16.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/url_canon_internal.h"
+
+namespace url {
+
+namespace test_utils {
+
+// Converts a string from native wchar_t format to char16 by casting each
+// element to char16, which discards any bits that do not fit in a char16.
+// This is different than the conversion function in base because it passes
+// invalid UTF-16 characters, which is important for test purposes. As a
+// result, this is not meant to handle true UTF-32 encoded strings.
+inline gurl_base::string16 TruncateWStringToUTF16(const wchar_t* src) {
+ gurl_base::string16 str;
+ int length = static_cast<int>(wcslen(src));
+ for (int i = 0; i < length; ++i) {
+ str.push_back(static_cast<gurl_base::char16>(src[i]));
+ }
+ return str;
+}
+
+} // namespace test_utils
+
+} // namespace url
+
+#endif // URL_URL_TEST_UTILS_H_
diff --git a/url/url_util.cc b/url/url_util.cc
new file mode 100644
index 0000000..47fc499
--- /dev/null
+++ b/url/url_util.cc
@@ -0,0 +1,809 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/url_util.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "base/debug/leak_annotations.h"
+#include "polyfills/base/logging.h"
+#include "base/no_destructor.h"
+#include "base/stl_util.h"
+#include "base/strings/string_util.h"
+#include "url/url_canon_internal.h"
+#include "url/url_constants.h"
+#include "url/url_file.h"
+#include "url/url_util_internal.h"
+
+namespace url {
+
+namespace {
+
+// List of currently registered schemes and associated properties.
+struct SchemeRegistry {
+ // Standard format schemes (see header for details).
+ std::vector<SchemeWithType> standard_schemes = {
+ {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ // Yes, file URLs can have a hostname, so file URLs should be handled as
+ // "standard". File URLs never have a port as specified by the SchemeType
+ // field. Unlike other SCHEME_WITH_HOST schemes, the 'host' in a file
+ // URL may be empty, a behavior which is special-cased during
+ // canonicalization.
+ {kFileScheme, SCHEME_WITH_HOST},
+ {kFtpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ {kGopherScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ {kWssScheme,
+ SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket secure.
+ {kWsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION}, // WebSocket.
+ {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY},
+ };
+
+ // Schemes that are allowed for referrers.
+ std::vector<SchemeWithType> referrer_schemes = {
+ {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
+ };
+
+ // Schemes that do not trigger mixed content warning.
+ std::vector<std::string> secure_schemes = {
+ kHttpsScheme,
+ kAboutScheme,
+ kDataScheme,
+ kWssScheme,
+ };
+
+ // Schemes that normal pages cannot link to or access (treated like "file").
+ std::vector<std::string> local_schemes = {
+ kFileScheme,
+ };
+
+ // Schemes that cause pages loaded with them to not have access to pages
+ // loaded with any other URL scheme.
+ std::vector<std::string> no_access_schemes = {
+ kAboutScheme,
+ kJavaScriptScheme,
+ kDataScheme,
+ };
+
+ // Schemes that can be sent CORS requests.
+ std::vector<std::string> cors_enabled_schemes = {
+ kHttpsScheme,
+ kHttpScheme,
+ kDataScheme,
+ };
+
+ // Schemes that can be used by web to store data (local storage, etc).
+ std::vector<std::string> web_storage_schemes = {
+ kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, kWssScheme, kWsScheme,
+ };
+
+ // Schemes that can bypass the Content-Security-Policy (CSP) checks.
+ std::vector<std::string> csp_bypassing_schemes = {};
+
+ // Schemes that are strictly empty documents, allowing them to commit
+ // synchronously.
+ std::vector<std::string> empty_document_schemes = {
+ kAboutScheme,
+ };
+
+ // Set by EnableNonStandardSchemesForAndroidWebView(); off by default.
+ bool allow_non_standard_schemes = false;
+};
+
+SchemeRegistry* GetSchemeRegistry() {
+ static gurl_base::NoDestructor<SchemeRegistry> registry;
+ return registry.get();
+}
+
+// Tells DoCanonicalize() whether to strip URL whitespace from its input
+// (REMOVE_WHITESPACE) or to use the input as given (DO_NOT_REMOVE_WHITESPACE).
+enum WhitespaceRemovalPolicy {
+ REMOVE_WHITESPACE,
+ DO_NOT_REMOVE_WHITESPACE,
+};
+
+// See the LockSchemeRegistries declaration in the header.
+bool scheme_registries_locked = false;
+
+// Maps a character type to the StringPiece type that views strings of it.
+// Only the |char| and |gurl_base::char16| specializations provide |Piece|.
+template <typename CHAR>
+struct CharToStringPiece {};
+template <> struct CharToStringPiece<char> {
+  using Piece = gurl_base::StringPiece;
+};
+template <> struct CharToStringPiece<gurl_base::char16> {
+  using Piece = gurl_base::StringPiece16;
+};
+
+// Compares the part of |spec| delimited by |component| with the lower-case
+// |compare_to| buffer, using LowerCaseEqualsASCII for the comparison.
+template <typename CHAR>
+inline bool DoCompareSchemeComponent(const CHAR* spec,
+                                     const Component& component,
+                                     const char* compare_to) {
+  // When the component is empty, it only matches the empty scheme.
+  if (!component.is_nonempty())
+    return *compare_to == '\0';
+  using Piece = typename CharToStringPiece<CHAR>::Piece;
+  const Piece scheme(&spec[component.begin], component.len);
+  return gurl_base::LowerCaseEqualsASCII(scheme, compare_to);
+}
+
+// Looks up the scheme delimited by |scheme| within |spec| in |schemes|. On a
+// match, stores the entry's SchemeType in |type| and returns true; otherwise
+// returns false and leaves |type| untouched.
+template <typename CHAR>
+bool DoIsInSchemes(const CHAR* spec,
+                   const Component& scheme,
+                   SchemeType* type,
+                   const std::vector<SchemeWithType>& schemes) {
+  if (!scheme.is_nonempty())
+    return false;  // Empty or invalid schemes are non-standard.
+  using Piece = typename CharToStringPiece<CHAR>::Piece;
+  const Piece candidate(&spec[scheme.begin], scheme.len);
+  for (const SchemeWithType& entry : schemes) {
+    if (!gurl_base::LowerCaseEqualsASCII(candidate, entry.scheme))
+      continue;
+    *type = entry.type;
+    return true;
+  }
+  return false;
+}
+
+// DoIsInSchemes() applied to the registry's list of standard schemes.
+template <typename CHAR>
+bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) {
+  const SchemeRegistry* registry = GetSchemeRegistry();
+  return DoIsInSchemes(spec, scheme, type, registry->standard_schemes);
+}
+
+// Extracts the scheme of |str| (after URL whitespace removal) and returns true
+// if it matches |compare|. If |found_scheme| is non-null it receives the
+// extracted scheme, or a default-constructed Component if there is none.
+template <typename CHAR>
+bool DoFindAndCompareScheme(const CHAR* str,
+                            int str_len,
+                            const char* compare,
+                            Component* found_scheme) {
+  // Remove whitespace before extracting the scheme; this matches the
+  // canonicalization done in the DoCanonicalize function.
+  RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec =
+      RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr);
+
+  Component our_scheme;
+  const bool has_scheme = ExtractScheme(spec, spec_len, &our_scheme);
+  if (found_scheme)
+    *found_scheme = has_scheme ? our_scheme : Component();
+  if (!has_scheme)
+    return false;
+  return DoCompareSchemeComponent(spec, our_scheme, compare);
+}
+
+// Canonicalizes |spec| into |output| and fills |output_parsed|, choosing the
+// parser/canonicalizer pair by scheme. Returns the selected canonicalizer's
+// result, or false if no scheme can be extracted. |trim_path_end| is only
+// consulted on the path-URL fallback.
+template <typename CHAR>
+bool DoCanonicalize(const CHAR* spec,
+ int spec_len,
+ bool trim_path_end,
+ WhitespaceRemovalPolicy whitespace_policy,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ output->ReserveSizeIfNeeded(spec_len);
+
+ // Remove URL whitespace if requested; this may copy into |whitespace_buffer|.
+ RawCanonOutputT<CHAR> whitespace_buffer;
+ if (whitespace_policy == REMOVE_WHITESPACE) {
+ spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len,
+ &output_parsed->potentially_dangling_markup);
+ }
+
+ Parsed parsed_input;
+#ifdef WIN32
+ // For Windows, we allow things that look like absolute Windows paths to be
+ // fixed up magically to file URLs. This is done for IE compatibility. For
+ // example, this will change "c:/foo" into a file URL rather than treating
+ // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
+ // There is similar logic in url_canon_relative.cc.
+ // For Mac & Unix, we don't do this (the equivalent would be "/foo/bar", which
+ // has no meaning as an absolute path name). Browsers on Mac & Unix don't
+ // generally do this, so there is no compatibility reason for doing so.
+ if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
+ DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
+ ParseFileURL(spec, spec_len, &parsed_input);
+ return CanonicalizeFileURL(spec, spec_len, parsed_input, charset_converter,
+ output, output_parsed);
+ }
+#endif
+
+ Component scheme;
+ if (!ExtractScheme(spec, spec_len, &scheme))
+ return false;
+
+ // Parse the input according to its scheme, then canonicalize the parsed form.
+ bool success;
+ SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
+ // File URLs are special.
+ ParseFileURL(spec, spec_len, &parsed_input);
+ success = CanonicalizeFileURL(spec, spec_len, parsed_input,
+ charset_converter, output, output_parsed);
+ } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
+ // Filesystem URLs are special.
+ ParseFileSystemURL(spec, spec_len, &parsed_input);
+ success = CanonicalizeFileSystemURL(spec, spec_len, parsed_input,
+ charset_converter, output,
+ output_parsed);
+
+ } else if (DoIsStandard(spec, scheme, &scheme_type)) {
+ // All "normal" URLs.
+ ParseStandardURL(spec, spec_len, &parsed_input);
+ success = CanonicalizeStandardURL(spec, spec_len, parsed_input, scheme_type,
+ charset_converter, output, output_parsed);
+
+ } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
+ // Mailto URLs are treated like standard URLs, with only a scheme, path,
+ // and query.
+ ParseMailtoURL(spec, spec_len, &parsed_input);
+ success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
+ output_parsed);
+
+ } else {
+ // "Weird" URLs like data: and javascript:.
+ ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
+ success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
+ output_parsed);
+ }
+ return success;
+}
+
+// Resolves |in_relative| against the base URL |base_spec|/|base_parsed| and
+// writes the canonical result to |output| and |output_parsed|. Input that is
+// not relative is canonicalized on its own. Returns false on failure.
+template<typename CHAR>
+bool DoResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const Parsed& base_parsed,
+ const CHAR* in_relative,
+ int in_relative_length,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ // Remove URL whitespace from the relative input, possibly copying it.
+ RawCanonOutputT<CHAR> whitespace_buffer;
+ int relative_length;
+ const CHAR* relative = RemoveURLWhitespace(
+ in_relative, in_relative_length, &whitespace_buffer, &relative_length,
+ &output_parsed->potentially_dangling_markup);
+
+ bool base_is_authority_based = false;
+ bool base_is_hierarchical = false;
+ if (base_spec &&
+ base_parsed.scheme.is_nonempty()) {
+ int after_scheme = base_parsed.scheme.end() + 1; // Skip past the colon.
+ int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
+ base_spec_len);
+ base_is_authority_based = num_slashes > 1;
+ base_is_hierarchical = num_slashes > 0;
+ }
+
+ SchemeType unused_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ bool standard_base_scheme =
+ base_parsed.scheme.is_nonempty() &&
+ DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type);
+
+ bool is_relative;
+ Component relative_component;
+ if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
+ (base_is_hierarchical || standard_base_scheme),
+ &is_relative, &relative_component)) {
+ // Error resolving.
+ return false;
+ }
+
+ // Don't reserve buffer space here. Instead, reserve in DoCanonicalize and
+ // ReserveRelativeURL, to enable more accurate buffer sizes.
+
+ // Non-standard URLs are normally treated as PathURLs, but when the base has
+ // an authority we parse it as a standard URL so the authority is preserved.
+ if (is_relative && base_is_authority_based && !standard_base_scheme) {
+ Parsed base_parsed_authority;
+ ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority);
+ if (base_parsed_authority.host.is_nonempty()) {
+ RawCanonOutputT<char> temporary_output;
+ bool did_resolve_succeed =
+ ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
+ relative_component, charset_converter,
+ &temporary_output, output_parsed);
+ // |output_parsed| was built from base_parsed_authority, not base_parsed, so
+ // rebuild it by re-canonicalizing; that call's own result is not checked.
+ DoCanonicalize(temporary_output.data(), temporary_output.length(), true,
+ REMOVE_WHITESPACE, charset_converter, output,
+ output_parsed);
+ return did_resolve_succeed;
+ }
+ } else if (is_relative) {
+ // Relative, resolve and canonicalize.
+ bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
+ DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
+ return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme, relative,
+ relative_component, charset_converter, output,
+ output_parsed);
+ }
+
+ // Not relative (or the authority-based base had no host): canonicalize input.
+ return DoCanonicalize(relative, relative_length, true,
+ DO_NOT_REMOVE_WHITESPACE, charset_converter, output,
+ output_parsed);
+}
+
+// Applies |replacements| to the URL |spec|/|parsed| and writes the result to
+// |output| and |out_parsed|. A scheme replacement is done by substitution and
+// re-parse; the remaining replacements are then applied by a recursive call.
+template<typename CHAR>
+bool DoReplaceComponents(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ const Replacements<CHAR>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ // If the scheme is overridden, do a simple string substitution and re-parse
+ // the whole thing rather than reason about components individually. For
+ // example, replacing the scheme of "http://e:8080/foo" with "file" gives
+ // "file:///E:/8080/foo" when re-parsed as a file URL: the port becomes part
+ // of the path, which no sane per-component rule would produce.
+ //
+ // Why allow these cases at all? Programmatically, there is almost no reason
+ // to replace the scheme; the most common source is JS building up a URL via
+ // the location object, which expects the string substitution behavior:
+ // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+ if (replacements.IsSchemeOverridden()) {
+ // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+ // the existing spec.
+ RawCanonOutput<128> scheme_replaced;
+ Component scheme_replaced_parsed;
+ CanonicalizeScheme(replacements.sources().scheme,
+ replacements.components().scheme,
+ &scheme_replaced, &scheme_replaced_parsed);
+
+ // We can assume that the input is canonicalized, which means it always has
+ // a colon after the scheme (or where the scheme would be).
+ int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
+ : 1;
+ if (spec_len - spec_after_colon > 0) {
+ scheme_replaced.Append(&spec[spec_after_colon],
+ spec_len - spec_after_colon);
+ }
+
+ // We now need to completely re-parse the resulting string since its meaning
+ // may have changed with the different scheme.
+ RawCanonOutput<128> recanonicalized;
+ Parsed recanonicalized_parsed;
+ DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
+ REMOVE_WHITESPACE, charset_converter, &recanonicalized,
+ &recanonicalized_parsed);
+
+ // Recurse using the version with the scheme already replaced. This will now
+ // use the replacement rules for the new scheme.
+ //
+ // Warning: this code assumes that ReplaceComponents will re-check all
+ // components for validity. This is because we can't fail if DoCanonicalize
+ // failed above since theoretically the thing making it fail could be
+ // getting replaced here. If ReplaceComponents didn't re-check everything,
+ // we wouldn't know if something *not* getting replaced is a problem.
+ // If the scheme-specific replacers are made more intelligent so they don't
+ // re-check everything, we should instead re-canonicalize the whole thing
+ // after this call to check validity (this assumes replacing the scheme is
+ // much much less common than other types of replacements, like clearing the
+ // ref).
+ Replacements<CHAR> replacements_no_scheme = replacements;
+ replacements_no_scheme.SetScheme(NULL, Component());
+ return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
+ recanonicalized_parsed, replacements_no_scheme,
+ charset_converter, output, out_parsed);
+ }
+
+ // TODO(csharrison): We could be smarter about size to reserve if this is done
+ // in callers below, and the code checks to see which components are being
+ // replaced, and with what length. If this ends up being a hot spot it should
+ // be changed.
+ output->ReserveSizeIfNeeded(spec_len);
+
+ // If we get here, then we know the scheme doesn't need to be replaced, so can
+ // just key off the scheme in the spec to know how to do the replacements.
+ if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
+ return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
+ out_parsed);
+ }
+ if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
+ return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
+ output, out_parsed);
+ }
+ SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+ if (DoIsStandard(spec, parsed.scheme, &scheme_type)) {
+ return ReplaceStandardURL(spec, parsed, replacements, scheme_type,
+ charset_converter, output, out_parsed);
+ }
+ if (DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
+ return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
+ }
+
+ // Default is a path URL.
+ return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
+}
+
+void DoAddScheme(const char* new_scheme, std::vector<std::string>* schemes) {
+  GURL_DCHECK(schemes);
+
+  // Registration is only legal before LockSchemeRegistries() has been called
+  // (see the header file for LockSchemeRegistries for more). Tripping the
+  // check below normally means the scheme is being set up too late in the
+  // application's init process: locate where the app does this initialization
+  // and calls LockSchemeRegistries, and add the new scheme there.
+  GURL_DCHECK(!scheme_registries_locked)
+      << "Trying to add a scheme after the lists have been locked.";
+
+  // An empty scheme name is ignored.
+  if (new_scheme[0] == '\0')
+    return;
+
+  // Schemes must be registered in lower case.
+  GURL_DCHECK_EQ(gurl_base::ToLowerASCII(new_scheme), new_scheme);
+  schemes->emplace_back(new_scheme);
+}
+
+void DoAddSchemeWithType(const char* new_scheme,
+                         SchemeType type,
+                         std::vector<SchemeWithType>* schemes) {
+  GURL_DCHECK(schemes);
+
+  // Registration is only legal before LockSchemeRegistries() has been called
+  // (see the header file for LockSchemeRegistries for more). Tripping the
+  // check below normally means the scheme is being set up too late in the
+  // application's init process: locate where the app does this initialization
+  // and calls LockSchemeRegistries, and add the new scheme there.
+  GURL_DCHECK(!scheme_registries_locked)
+      << "Trying to add a scheme after the lists have been locked.";
+
+  // An empty scheme name is ignored.
+  const size_t scheme_len = strlen(new_scheme);
+  if (scheme_len == 0)
+    return;
+
+  // Schemes must be registered in lower case.
+  GURL_DCHECK_EQ(gurl_base::ToLowerASCII(new_scheme), new_scheme);
+
+  // Duplicate the scheme name (terminator included) into a new buffer that
+  // the list entry will point at. This pointer will be leaked on shutdown,
+  // which is why it is annotated as a leaking object.
+  char* dup_scheme = new char[scheme_len + 1];
+  ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme);
+  memcpy(dup_scheme, new_scheme, scheme_len + 1);
+
+  schemes->push_back(SchemeWithType{dup_scheme, type});
+}
+
+} // namespace
+
+void ResetForTests() {
+ *GetSchemeRegistry() = SchemeRegistry();
+}
+
+void EnableNonStandardSchemesForAndroidWebView() {
+ GetSchemeRegistry()->allow_non_standard_schemes = true;
+}
+
+bool AllowNonStandardSchemesForAndroidWebView() {
+ return GetSchemeRegistry()->allow_non_standard_schemes;
+}
+
+void AddStandardScheme(const char* new_scheme, SchemeType type) {
+ DoAddSchemeWithType(new_scheme, type, &GetSchemeRegistry()->standard_schemes);
+}
+
+void AddReferrerScheme(const char* new_scheme, SchemeType type) {
+ DoAddSchemeWithType(new_scheme, type, &GetSchemeRegistry()->referrer_schemes);
+}
+
+void AddSecureScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->secure_schemes);
+}
+
+const std::vector<std::string>& GetSecureSchemes() {
+ return GetSchemeRegistry()->secure_schemes;
+}
+
+void AddLocalScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->local_schemes);
+}
+
+const std::vector<std::string>& GetLocalSchemes() {
+ return GetSchemeRegistry()->local_schemes;
+}
+
+void AddNoAccessScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->no_access_schemes);
+}
+
+const std::vector<std::string>& GetNoAccessSchemes() {
+ return GetSchemeRegistry()->no_access_schemes;
+}
+
+void AddCorsEnabledScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->cors_enabled_schemes);
+}
+
+const std::vector<std::string>& GetCorsEnabledSchemes() {
+ return GetSchemeRegistry()->cors_enabled_schemes;
+}
+
+void AddWebStorageScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->web_storage_schemes);
+}
+
+const std::vector<std::string>& GetWebStorageSchemes() {
+ return GetSchemeRegistry()->web_storage_schemes;
+}
+
+void AddCSPBypassingScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->csp_bypassing_schemes);
+}
+
+const std::vector<std::string>& GetCSPBypassingSchemes() {
+ return GetSchemeRegistry()->csp_bypassing_schemes;
+}
+
+void AddEmptyDocumentScheme(const char* new_scheme) {
+ DoAddScheme(new_scheme, &GetSchemeRegistry()->empty_document_schemes);
+}
+
+const std::vector<std::string>& GetEmptyDocumentSchemes() {
+ return GetSchemeRegistry()->empty_document_schemes;
+}
+
+void LockSchemeRegistries() {
+ scheme_registries_locked = true;
+}
+
+bool IsStandard(const char* spec, const Component& scheme) {
+ SchemeType unused_scheme_type;
+ return DoIsStandard(spec, scheme, &unused_scheme_type);
+}
+
+bool GetStandardSchemeType(const char* spec,
+ const Component& scheme,
+ SchemeType* type) {
+ return DoIsStandard(spec, scheme, type);
+}
+
+bool GetStandardSchemeType(const gurl_base::char16* spec,
+ const Component& scheme,
+ SchemeType* type) {
+ return DoIsStandard(spec, scheme, type);
+}
+
+bool IsStandard(const gurl_base::char16* spec, const Component& scheme) {
+ SchemeType unused_scheme_type;
+ return DoIsStandard(spec, scheme, &unused_scheme_type);
+}
+
+bool IsReferrerScheme(const char* spec, const Component& scheme) {
+ SchemeType unused_scheme_type;
+ return DoIsInSchemes(spec, scheme, &unused_scheme_type,
+ GetSchemeRegistry()->referrer_schemes);
+}
+
+bool FindAndCompareScheme(const char* str,
+ int str_len,
+ const char* compare,
+ Component* found_scheme) {
+ return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
+}
+
+bool FindAndCompareScheme(const gurl_base::char16* str,
+ int str_len,
+ const char* compare,
+ Component* found_scheme) {
+ return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
+}
+
+bool DomainIs(gurl_base::StringPiece canonical_host,
+              gurl_base::StringPiece canonical_domain) {
+  // Neither an empty host nor an empty domain ever matches.
+  if (canonical_host.empty() || canonical_domain.empty())
+    return false;
+
+  // If the host name ends with a dot but the input domain doesn't, then the
+  // host's trailing dot takes no part in the comparison.
+  const bool skip_trailing_dot =
+      canonical_host.back() == '.' && canonical_domain.back() != '.';
+  const size_t host_len = canonical_host.length() - (skip_trailing_dot ? 1 : 0);
+  const size_t domain_len = canonical_domain.length();
+
+  // A host shorter than the domain cannot end with it.
+  if (host_len < domain_len)
+    return false;
+
+  // |suffix_start| is the offset in the host of the part that gets compared
+  // with the domain, not the start of the whole host name.
+  const size_t suffix_start = host_len - domain_len;
+  const gurl_base::StringPiece host_suffix(
+      canonical_host.data() + suffix_start, domain_len);
+  if (host_suffix != canonical_domain)
+    return false;
+
+  // Make sure there aren't extra characters in the host before the compared
+  // part: if the host name is longer than the input domain name, then the
+  // character immediately before the compared part should be a dot. For
+  // example, www.google.com has domain "google.com", but
+  // www.iamnotgoogle.com does not. No such check is made when the domain
+  // itself starts with a dot.
+  if (canonical_domain[0] == '.' || suffix_start == 0)
+    return true;
+  return canonical_host[suffix_start - 1] == '.';
+}
+
+bool HostIsIPAddress(gurl_base::StringPiece host) {
+ url::RawCanonOutputT<char, 128> ignored_output;
+ url::CanonHostInfo host_info;
+ url::CanonicalizeIPAddress(host.data(), Component(0, host.length()),
+ &ignored_output, &host_info);
+ return host_info.IsIPAddress();
+}
+
+bool Canonicalize(const char* spec,
+ int spec_len,
+ bool trim_path_end,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
+ charset_converter, output, output_parsed);
+}
+
+bool Canonicalize(const gurl_base::char16* spec,
+ int spec_len,
+ bool trim_path_end,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
+ charset_converter, output, output_parsed);
+}
+
+bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const Parsed& base_parsed,
+ const char* relative,
+ int relative_length,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+ relative, relative_length,
+ charset_converter, output, output_parsed);
+}
+
+bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const Parsed& base_parsed,
+ const gurl_base::char16* relative,
+ int relative_length,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed) {
+ return DoResolveRelative(base_spec, base_spec_len, base_parsed,
+ relative, relative_length,
+ charset_converter, output, output_parsed);
+}
+
+bool ReplaceComponents(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ return DoReplaceComponents(spec, spec_len, parsed, replacements,
+ charset_converter, output, out_parsed);
+}
+
+bool ReplaceComponents(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* out_parsed) {
+ return DoReplaceComponents(spec, spec_len, parsed, replacements,
+ charset_converter, output, out_parsed);
+}
+
+// Decodes the '%' escape sequences in |input| and appends the result to
+// |output| as UTF-16. Invalid escape sequences are copied literally; |mode|
+// selects what happens to decoded bytes that are not valid UTF-8.
+void DecodeURLEscapeSequences(const char* input,
+ int length,
+ DecodeURLMode mode,
+ CanonOutputW* output) {
+ RawCanonOutputT<char> unescaped_chars;
+ for (int i = 0; i < length; i++) {
+ if (input[i] == '%') {
+ unsigned char ch;
+ if (DecodeEscaped(input, &i, length, &ch)) {
+ unescaped_chars.push_back(ch);
+ } else {
+ // Invalid escape sequence, copy the percent literal.
+ unescaped_chars.push_back('%');
+ }
+ } else {
+ // Regular non-escaped 8-bit character.
+ unescaped_chars.push_back(input[i]);
+ }
+ }
+
+ int output_initial_length = output->length();
+ // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
+ // JavaScript URLs, but Firefox and Safari do.
+ for (int i = 0; i < unescaped_chars.length(); i++) {
+ unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+ if (uch < 0x80) {
+ // ASCII, just append directly.
+ output->push_back(uch);
+ } else {
+ // |next_character| will point to the last byte of the decoded character.
+ int next_character = i;
+ unsigned code_point;
+ if (ReadUTFChar(unescaped_chars.data(), &next_character,
+ unescaped_chars.length(), &code_point)) {
+ // Valid UTF-8 character, convert to UTF-16.
+ AppendUTF16Value(code_point, output);
+ i = next_character;
+ } else if (mode == DecodeURLMode::kUTF8) {
+ GURL_DCHECK_EQ(code_point, 0xFFFDU);
+ AppendUTF16Value(code_point, output);
+ i = next_character;
+ } else {
+ // Not valid UTF-8: revert the changes made to |output| and instead append
+ // every unescaped byte, first to last, promoted to a UTF-16 code unit.
+ output->set_length(output_initial_length);
+ for (int j = 0; j < unescaped_chars.length(); ++j)
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(j)));
+ break;
+ }
+ }
+ }
+}
+
+void EncodeURIComponent(const char* input, int length, CanonOutput* output) {
+  for (int pos = 0; pos < length; ++pos) {
+    const unsigned char byte = static_cast<unsigned char>(input[pos]);
+    if (!IsComponentChar(byte))
+      AppendEscapedChar(byte, output);  // Not a component char: escape it.
+    else
+      output->push_back(byte);  // Component chars are copied unchanged.
+  }
+}
+
+bool CompareSchemeComponent(const char* spec,
+ const Component& component,
+ const char* compare_to) {
+ return DoCompareSchemeComponent(spec, component, compare_to);
+}
+
+bool CompareSchemeComponent(const gurl_base::char16* spec,
+ const Component& component,
+ const char* compare_to) {
+ return DoCompareSchemeComponent(spec, component, compare_to);
+}
+
+} // namespace url
diff --git a/url/url_util.h b/url/url_util.h
new file mode 100644
index 0000000..473ae5f
--- /dev/null
+++ b/url/url_util.h
@@ -0,0 +1,288 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_UTIL_H_
+#define URL_URL_UTIL_H_
+
+#include <string>
+#include <vector>
+
+#include "polyfills/base/component_export.h"
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_constants.h"
+
+namespace url {
+
+// Init ------------------------------------------------------------------------
+
+// Resets all custom schemes to the default values. Not thread-safe.
+COMPONENT_EXPORT(URL) void ResetForTests();
+
+// Schemes ---------------------------------------------------------------------
+
+// Changes the behavior of SchemeHostPort / Origin to allow non-standard schemes
+// to be specified, instead of canonicalizing them to an invalid SchemeHostPort
+// or opaque Origin, respectively. This is used for Android WebView backwards
+// compatibility, which allows the use of custom schemes: content hosted in
+// Android WebView assumes that one URL with a non-standard scheme will be
+// same-origin to another URL with the same non-standard scheme.
+//
+// Not thread-safe.
+COMPONENT_EXPORT(URL) void EnableNonStandardSchemesForAndroidWebView();
+
+// Whether or not SchemeHostPort and Origin allow non-standard schemes.
+COMPONENT_EXPORT(URL) bool AllowNonStandardSchemesForAndroidWebView();
+
+ // A pair for representing a standard scheme name and the SchemeType for it.
+ struct COMPONENT_EXPORT(URL) SchemeWithType {
+ const char* scheme;  // The scheme name.
+ SchemeType type;  // The SchemeType associated with |scheme|.
+ };
+
+ // The following Add*Scheme methods are not threadsafe and cannot be called
+// concurrently with any other url_util function. They will assert if the lists
+// of schemes have been locked (see LockSchemeRegistries).
+
+// Adds an application-defined scheme to the internal list of "standard-format"
+// URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic
+// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3).
+
+COMPONENT_EXPORT(URL)
+void AddStandardScheme(const char* new_scheme, SchemeType scheme_type);
+
+// Adds an application-defined scheme to the internal list of schemes allowed
+// for referrers.
+COMPONENT_EXPORT(URL)
+void AddReferrerScheme(const char* new_scheme, SchemeType scheme_type);
+
+// Adds an application-defined scheme to the list of schemes that do not trigger
+// mixed content warnings.
+COMPONENT_EXPORT(URL) void AddSecureScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetSecureSchemes();
+
+// Adds an application-defined scheme to the list of schemes that normal pages
+// cannot link to or access (i.e., with the same security rules as those applied
+// to "file" URLs).
+COMPONENT_EXPORT(URL) void AddLocalScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetLocalSchemes();
+
+// Adds an application-defined scheme to the list of schemes that cause pages
+// loaded with them to not have access to pages loaded with any other URL
+// scheme.
+COMPONENT_EXPORT(URL) void AddNoAccessScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetNoAccessSchemes();
+
+// Adds an application-defined scheme to the list of schemes that can be sent
+// CORS requests.
+COMPONENT_EXPORT(URL) void AddCorsEnabledScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCorsEnabledSchemes();
+
+// Adds an application-defined scheme to the list of web schemes that can be
+// used by web to store data (e.g. cookies, local storage, ...). This is
+// to differentiate them from schemes that can store data but are not used on
+// web (e.g. application's internal schemes) or schemes that are used on web but
+// cannot store data.
+COMPONENT_EXPORT(URL) void AddWebStorageScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetWebStorageSchemes();
+
+// Adds an application-defined scheme to the list of schemes that can bypass the
+// Content-Security-Policy (CSP) checks.
+COMPONENT_EXPORT(URL) void AddCSPBypassingScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetCSPBypassingSchemes();
+
+// Adds an application-defined scheme to the list of schemes that are strictly
+// empty documents, allowing them to commit synchronously.
+COMPONENT_EXPORT(URL) void AddEmptyDocumentScheme(const char* new_scheme);
+COMPONENT_EXPORT(URL) const std::vector<std::string>& GetEmptyDocumentSchemes();
+
+// Sets a flag to prevent future calls to Add*Scheme from succeeding.
+//
+// This is designed to help prevent errors for multithreaded applications.
+// Normal usage would be to call Add*Scheme for your custom schemes at
+// the beginning of program initialization, and then LockSchemeRegistries. This
+// prevents future callers from mistakenly calling Add*Scheme when the
+// program is running with multiple threads, where such usage would be
+// dangerous.
+//
+// We could have had Add*Scheme use a lock instead, but that would add
+// some platform-specific dependencies we don't otherwise have now, and is
+// overkill considering the normal usage is so simple.
+COMPONENT_EXPORT(URL) void LockSchemeRegistries();
+
+// Locates the scheme in the given string and places it into |found_scheme|,
+// which may be NULL to indicate the caller does not care about the range.
+//
+// Returns whether the given |compare| scheme matches the scheme found in the
+// input (if any). The |compare| scheme must be a valid canonical scheme or
+// the result of the comparison is undefined.
+COMPONENT_EXPORT(URL)
+bool FindAndCompareScheme(const char* str,
+ int str_len,
+ const char* compare,
+ Component* found_scheme);
+COMPONENT_EXPORT(URL)
+bool FindAndCompareScheme(const gurl_base::char16* str,
+ int str_len,
+ const char* compare,
+ Component* found_scheme);
+ // Convenience overload taking the input as a std::string.
+ inline bool FindAndCompareScheme(const std::string& str, const char* compare,
+                                  Component* found_scheme) {
+   const int str_len = static_cast<int>(str.size());
+   return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+ }
+ // Convenience overload taking the input as a gurl_base::string16.
+ inline bool FindAndCompareScheme(const gurl_base::string16& str,
+                                  const char* compare, Component* found_scheme) {
+   const int str_len = static_cast<int>(str.size());
+   return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+ }
+
+// Returns true if the given scheme identified by |scheme| within |spec| is in
+// the list of known standard-format schemes (see AddStandardScheme).
+COMPONENT_EXPORT(URL)
+bool IsStandard(const char* spec, const Component& scheme);
+COMPONENT_EXPORT(URL)
+bool IsStandard(const gurl_base::char16* spec, const Component& scheme);
+
+// Returns true if the given scheme identified by |scheme| within |spec| is in
+// the list of allowed schemes for referrers (see AddReferrerScheme).
+COMPONENT_EXPORT(URL)
+bool IsReferrerScheme(const char* spec, const Component& scheme);
+
+// Returns true and sets |type| to the SchemeType of the given scheme
+// identified by |scheme| within |spec| if the scheme is in the list of known
+// standard-format schemes (see AddStandardScheme).
+COMPONENT_EXPORT(URL)
+bool GetStandardSchemeType(const char* spec,
+ const Component& scheme,
+ SchemeType* type);
+COMPONENT_EXPORT(URL)
+bool GetStandardSchemeType(const gurl_base::char16* spec,
+ const Component& scheme,
+ SchemeType* type);
+
+// Hosts ----------------------------------------------------------------------
+
+// Returns true if the |canonical_host| matches or is in the same domain as the
+// given |canonical_domain| string. For example, if the canonicalized hostname
+// is "www.google.com", this will return true for "com", "google.com", and
+// "www.google.com" domains.
+//
+// If either of the input StringPieces is empty, the return value is false. The
+// input domain should match host canonicalization rules. i.e. it should be
+// lowercase except for escape chars.
+COMPONENT_EXPORT(URL)
+bool DomainIs(gurl_base::StringPiece canonical_host,
+ gurl_base::StringPiece canonical_domain);
+
+// Returns true if the hostname is an IP address. Note: this function isn't very
+// cheap, as it must re-parse the host to verify.
+COMPONENT_EXPORT(URL) bool HostIsIPAddress(gurl_base::StringPiece host);
+
+// URL library wrappers --------------------------------------------------------
+
+// Parses the given spec according to the extracted scheme type. Normal users
+// should use the URL object, although this may be useful if performance is
+// critical and you don't want to do the heap allocation for the std::string.
+//
+// As with the Canonicalize* functions, the charset converter can
+// be NULL to use UTF-8 (it will be faster in this case).
+//
+// Returns true if a valid URL was produced, false if not. On failure, the
+// output and parsed structures will still be filled and will be consistent,
+// but they will not represent a loadable URL.
+COMPONENT_EXPORT(URL)
+bool Canonicalize(const char* spec,
+ int spec_len,
+ bool trim_path_end,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed);
+COMPONENT_EXPORT(URL)
+bool Canonicalize(const gurl_base::char16* spec,
+ int spec_len,
+ bool trim_path_end,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed);
+
+// Resolves a potentially relative URL relative to the given parsed base URL.
+// The base MUST be valid. The resulting canonical URL and parsed information
+ // will be placed into the given out variables.
+//
+// The relative need not be relative. If we discover that it's absolute, this
+// will produce a canonical version of that URL. See Canonicalize() for more
+// about the charset_converter.
+//
+// Returns true if the output is valid, false if the input could not produce
+// a valid URL.
+COMPONENT_EXPORT(URL)
+bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const Parsed& base_parsed,
+ const char* relative,
+ int relative_length,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed);
+COMPONENT_EXPORT(URL)
+bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const Parsed& base_parsed,
+ const gurl_base::char16* relative,
+ int relative_length,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* output_parsed);
+
+// Replaces components in the given VALID input URL. The new canonical URL info
+// is written to output and out_parsed.
+//
+// Returns true if the resulting URL is valid.
+COMPONENT_EXPORT(URL)
+bool ReplaceComponents(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
+COMPONENT_EXPORT(URL)
+bool ReplaceComponents(const char* spec,
+ int spec_len,
+ const Parsed& parsed,
+ const Replacements<gurl_base::char16>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ Parsed* out_parsed);
+
+// String helper functions -----------------------------------------------------
+
+ enum class DecodeURLMode {
+ // UTF-8 decode only. Invalid byte sequences are replaced with U+FFFD.
+ kUTF8,
+ // Try UTF-8 decoding. If the input contains any byte sequence that is not
+ // valid UTF-8, map every byte to the code point with the same value instead.
+ kUTF8OrIsomorphic,
+ };
+
+// Unescapes the given string using URL escaping rules.
+COMPONENT_EXPORT(URL)
+void DecodeURLEscapeSequences(const char* input,
+ int length,
+ DecodeURLMode mode,
+ CanonOutputW* output);
+
+// Escapes the given string as defined by the JS method encodeURIComponent. See
+// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
+COMPONENT_EXPORT(URL)
+void EncodeURIComponent(const char* input, int length, CanonOutput* output);
+
+} // namespace url
+
+#endif // URL_URL_UTIL_H_
diff --git a/url/url_util_internal.h b/url/url_util_internal.h
new file mode 100644
index 0000000..08f8929
--- /dev/null
+++ b/url/url_util_internal.h
@@ -0,0 +1,26 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_URL_UTIL_INTERNAL_H_
+#define URL_URL_UTIL_INTERNAL_H_
+
+#include <string>
+
+#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
+
+namespace url {
+
+// Given a string and a range inside the string, compares it to the given
+// lower-case |compare_to| buffer.
+bool CompareSchemeComponent(const char* spec,
+ const Component& component,
+ const char* compare_to);
+bool CompareSchemeComponent(const gurl_base::char16* spec,
+ const Component& component,
+ const char* compare_to);
+
+} // namespace url
+
+#endif // URL_URL_UTIL_INTERNAL_H_
diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc
new file mode 100644
index 0000000..741c1dc
--- /dev/null
+++ b/url/url_util_unittest.cc
@@ -0,0 +1,527 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+
+#include "base/stl_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_test_utils.h"
+#include "url/url_util.h"
+
+namespace url {
+
+ class URLUtilTest : public testing::Test {  // Fixture for the url_util tests.
+ public:
+ URLUtilTest() = default;
+ ~URLUtilTest() override {
+ // Drop schemes a test registered so they cannot leak into later tests.
+ ResetForTests();
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(URLUtilTest);
+ };
+
+ TEST_F(URLUtilTest, FindAndCompareScheme) {
+ Component found_scheme;
+
+ // Simple case where the scheme is found and matches; NULL skips the range.
+ const char kStr1[] = "http://www.com/";
+ EXPECT_TRUE(FindAndCompareScheme(
+ kStr1, static_cast<int>(strlen(kStr1)), "http", NULL));
+ EXPECT_TRUE(FindAndCompareScheme(
+ kStr1, static_cast<int>(strlen(kStr1)), "http", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component(0, 4));
+
+ // The scheme is found but doesn't match; its range is still reported.
+ EXPECT_FALSE(FindAndCompareScheme(
+ kStr1, static_cast<int>(strlen(kStr1)), "https", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component(0, 4));
+
+ // No scheme at all: the range is reset to a default Component().
+ const char kStr2[] = "httpfoobar";
+ EXPECT_FALSE(FindAndCompareScheme(
+ kStr2, static_cast<int>(strlen(kStr2)), "http", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component());
+
+ // When there is an empty scheme, it should match the empty scheme.
+ const char kStr3[] = ":foo.com/";
+ EXPECT_TRUE(FindAndCompareScheme(
+ kStr3, static_cast<int>(strlen(kStr3)), "", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component(0, 0));
+
+ // But when there is no scheme, it should fail.
+ EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component());
+
+ // Whitespace (\r, \n, \t) inside the scheme is removed before comparison.
+ // NOTE(review): range (1, 10) looks relative to the stripped text; confirm.
+ const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
+ EXPECT_TRUE(FindAndCompareScheme(whtspc_str,
+ static_cast<int>(strlen(whtspc_str)),
+ "javascript", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component(1, 10));
+
+ // Control characters are stripped from the ends but kept in the middle, so
+ // the comparison fails and the range (1, 11) still covers the embedded ones.
+ const char ctrl_str[] = "\02jav\02scr\03ipt:alert(1)";
+ EXPECT_FALSE(FindAndCompareScheme(ctrl_str,
+ static_cast<int>(strlen(ctrl_str)),
+ "javascript", &found_scheme));
+ EXPECT_TRUE(found_scheme == Component(1, 11));
+ }
+
+ TEST_F(URLUtilTest, IsStandard) {
+   // "http" is standard without any registration; "foo" was never added.
+   const char kKnown[] = "http";
+   const char kUnknown[] = "foo";
+   EXPECT_TRUE(IsStandard(kKnown, Component(0, strlen(kKnown))));
+   EXPECT_FALSE(IsStandard(kUnknown, Component(0, strlen(kUnknown))));
+ }
+
+ TEST_F(URLUtilTest, IsReferrerScheme) {
+   // "http" is accepted without any registration; "foo" was never added.
+   const char kKnown[] = "http";
+   const char kUnknown[] = "foo";
+   EXPECT_TRUE(IsReferrerScheme(kKnown, Component(0, strlen(kKnown))));
+   EXPECT_FALSE(IsReferrerScheme(kUnknown, Component(0, strlen(kUnknown))));
+ }
+
+ TEST_F(URLUtilTest, AddReferrerScheme) {
+   // "foo" only becomes a referrer scheme once it has been registered.
+   const char kCustom[] = "foo";
+   EXPECT_FALSE(IsReferrerScheme(kCustom, Component(0, strlen(kCustom))));
+   AddReferrerScheme(kCustom, url::SCHEME_WITH_HOST);
+   EXPECT_TRUE(IsReferrerScheme(kCustom, Component(0, strlen(kCustom))));
+ }
+
+ TEST_F(URLUtilTest, ShutdownCleansUpSchemes) {
+   const char kCustom[] = "foo";
+   const Component whole(0, strlen(kCustom));
+   EXPECT_FALSE(IsReferrerScheme(kCustom, whole));
+   AddReferrerScheme(kCustom, url::SCHEME_WITH_HOST);
+   EXPECT_TRUE(IsReferrerScheme(kCustom, whole));
+   // Resetting must forget the registration again.
+   ResetForTests();
+   EXPECT_FALSE(IsReferrerScheme(kCustom, whole));
+ }
+
+ TEST_F(URLUtilTest, GetStandardSchemeType) {
+   // |actual| is pre-seeded with a value other than the one expected back.
+   url::SchemeType actual;
+
+   // "http" must report host, port and user information.
+   const char kHttp[] = "http";
+   actual = url::SCHEME_WITHOUT_AUTHORITY;
+   EXPECT_TRUE(
+       GetStandardSchemeType(kHttp, Component(0, strlen(kHttp)), &actual));
+   EXPECT_EQ(url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, actual);
+
+   const char kFilesystem[] = "filesystem";
+   actual = url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+   EXPECT_TRUE(GetStandardSchemeType(
+       kFilesystem, Component(0, strlen(kFilesystem)), &actual));
+   EXPECT_EQ(url::SCHEME_WITHOUT_AUTHORITY, actual);
+
+   // "foo" is not registered, so the lookup must fail.
+   const char kUnknown[] = "foo";
+   actual = url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+   EXPECT_FALSE(
+       GetStandardSchemeType(kUnknown, Component(0, strlen(kUnknown)), &actual));
+ }
+
+ TEST_F(URLUtilTest, ReplaceComponents) {
+   Parsed empty_parsed;
+   RawCanonOutputT<char> sink;
+   Parsed result_parsed;
+   Replacements<char> repl;
+
+   // Runs the replacement against a null spec and then an empty spec. Nothing
+   // is asserted: the test only checks that these calls do not crash.
+   auto replace_null_then_empty = [&]() {
+     ReplaceComponents(NULL, 0, empty_parsed, repl, NULL, &sink, &result_parsed);
+     ReplaceComponents("", 0, empty_parsed, repl, NULL, &sink, &result_parsed);
+   };
+   repl.SetRef("test", Component(0, 4));
+   replace_null_then_empty();
+   repl.ClearRef();
+   repl.SetHost("test", Component(0, 4));
+   replace_null_then_empty();
+   repl.ClearHost();
+   replace_null_then_empty();
+   replace_null_then_empty();
+ }
+
+ // Test helper: canonicalizes |base_url|, swaps in |scheme|, returns the spec.
+ static std::string CheckReplaceScheme(const char* base_url,
+                                       const char* scheme) {
+   // ReplaceComponents expects a valid input URL, so canonicalize first.
+   RawCanonOutput<32> canon_spec;
+   Parsed canon_parsed;
+   Canonicalize(base_url, strlen(base_url), true, NULL, &canon_spec,
+                &canon_parsed);
+
+   Replacements<char> scheme_only;
+   scheme_only.SetScheme(scheme, Component(0, strlen(scheme)));
+
+   std::string result;
+   StdStringCanonOutput result_sink(&result);
+   Parsed result_parsed;
+   ReplaceComponents(canon_spec.data(), canon_spec.length(), canon_parsed,
+                     scheme_only, NULL, &result_sink, &result_parsed);
+   result_sink.Complete();
+   return result;
+ }
+
+ TEST_F(URLUtilTest, ReplaceScheme) {
+ EXPECT_EQ("https://google.com/",
+ CheckReplaceScheme("http://google.com/", "https"));
+ EXPECT_EQ("file://google.com/",
+ CheckReplaceScheme("http://google.com/", "file"));
+ EXPECT_EQ("http://home/Build",
+ CheckReplaceScheme("file:///Home/Build", "http"));
+ EXPECT_EQ("javascript:foo",
+ CheckReplaceScheme("about:foo", "javascript"));
+ EXPECT_EQ("://google.com/",
+ CheckReplaceScheme("http://google.com/", ""));  // Empty replacement scheme.
+ EXPECT_EQ("http://google.com/",
+ CheckReplaceScheme("about:google.com", "http"));
+ EXPECT_EQ("http:", CheckReplaceScheme("", "http"));  // Empty base URL.
+
+ #ifdef WIN32
+ // Magic Windows drive letter behavior when converting to a file URL.
+ EXPECT_EQ("file:///E:/foo/",
+ CheckReplaceScheme("http://localhost/e:foo/", "file"));
+ #endif
+
+ // NOTE(review): an earlier comment said this would change to the value below
+ // once http://crbug.com/160 is fixed; it already matches. Confirm it is stale.
+ EXPECT_EQ("about://google.com/",
+ CheckReplaceScheme("http://google.com/", "about"));
+ // Spaces in the path and ref are escaped; the trailing space is dropped.
+ EXPECT_EQ("http://example.com/%20hello%20#%20world",
+ CheckReplaceScheme("myscheme:example.com/ hello # world ", "http"));
+ }
+
+ TEST_F(URLUtilTest, DecodeURLEscapeSequences) {
+ struct DecodeCase {
+ const char* input;
+ const char* output;  // Expected decoding, as UTF-8, in both decode modes.
+ } decode_cases[] = {
+ {"hello, world", "hello, world"},
+ {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"},
+ {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/",
+ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"},
+ {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/",
+ " !\"#$%&'()*+,-.//"},
+ {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/",
+ "0123456789:;<=>?/"},
+ {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/",
+ "@ABCDEFGHIJKLMNO/"},
+ {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/",
+ "PQRSTUVWXYZ[\\]^_/"},
+ {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/",
+ "`abcdefghijklmno/"},
+ {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/",
+ "pqrstuvwxyz{|}~\x7f/"},
+ {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+ };
+ // Every case above must decode to the same text in both decode modes.
+ for (size_t i = 0; i < gurl_base::size(decode_cases); i++) {
+ const char* input = decode_cases[i].input;
+ RawCanonOutputT<gurl_base::char16> output;
+ DecodeURLEscapeSequences(input, strlen(input),
+ DecodeURLMode::kUTF8OrIsomorphic, &output);
+ EXPECT_EQ(decode_cases[i].output,
+ gurl_base::UTF16ToUTF8(gurl_base::string16(output.data(),
+ output.length())));
+
+ RawCanonOutputT<gurl_base::char16> output_utf8;
+ DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8,
+ &output_utf8);
+ EXPECT_EQ(decode_cases[i].output,
+ gurl_base::UTF16ToUTF8(
+ gurl_base::string16(output_utf8.data(), output_utf8.length())));
+ }
+
+ // Our decode should decode %00 rather than leave the escape in the output.
+ const char zero_input[] = "%00";
+ RawCanonOutputT<gurl_base::char16> zero_output;
+ DecodeURLEscapeSequences(zero_input, strlen(zero_input), DecodeURLMode::kUTF8,
+ &zero_output);
+ EXPECT_NE("%00", gurl_base::UTF16ToUTF8(
+ gurl_base::string16(zero_output.data(), zero_output.length())));
+
+ // Test the error behavior for invalid UTF-8.
+ struct Utf8DecodeCase {
+ const char* input;
+ std::vector<gurl_base::char16> expected_iso;  // kUTF8OrIsomorphic, 0-terminated.
+ std::vector<gurl_base::char16> expected_utf8;  // kUTF8, 0-terminated.
+ } utf8_decode_cases[] = {
+ // %e5%a5%bd is a valid UTF-8 sequence. U+597D
+ {"%e4%a0%e5%a5%bd",
+ {0x00e4, 0x00a0, 0x00e5, 0x00a5, 0x00bd, 0},
+ {0xfffd, 0x597d, 0}},
+ {"%e5%a5%bd%e4%a0",
+ {0x00e5, 0x00a5, 0x00bd, 0x00e4, 0x00a0, 0},
+ {0x597d, 0xfffd, 0}},
+ {"%e4%a0%e5%bd",
+ {0x00e4, 0x00a0, 0x00e5, 0x00bd, 0},
+ {0xfffd, 0xfffd, 0}},
+ };
+
+ for (const auto& test : utf8_decode_cases) {
+ const char* input = test.input;
+ RawCanonOutputT<gurl_base::char16> output_iso;
+ DecodeURLEscapeSequences(input, strlen(input),
+ DecodeURLMode::kUTF8OrIsomorphic, &output_iso);
+ EXPECT_EQ(gurl_base::string16(test.expected_iso.data()),
+ gurl_base::string16(output_iso.data(), output_iso.length()));
+
+ RawCanonOutputT<gurl_base::char16> output_utf8;
+ DecodeURLEscapeSequences(input, strlen(input), DecodeURLMode::kUTF8,
+ &output_utf8);
+ EXPECT_EQ(gurl_base::string16(test.expected_utf8.data()),
+ gurl_base::string16(output_utf8.data(), output_utf8.length()));
+ }
+ }
+
+ TEST_F(URLUtilTest, TestEncodeURIComponent) {
+   // Each row pairs a raw input with its expected EncodeURIComponent() output.
+   struct EncodeCase {
+     const char* input;
+     const char* output;
+   } encode_cases[] = {
+       {"hello, world", "hello%2C%20world"},
+       {"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
+        "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"},
+       {"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
+        "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"},
+       {" !\"#$%&'()*+,-./",
+        "%20!%22%23%24%25%26%27()*%2B%2C-.%2F"},
+       {"0123456789:;<=>?",
+        "0123456789%3A%3B%3C%3D%3E%3F"},
+       {"@ABCDEFGHIJKLMNO",
+        "%40ABCDEFGHIJKLMNO"},
+       {"PQRSTUVWXYZ[\\]^_",
+        "PQRSTUVWXYZ%5B%5C%5D%5E_"},
+       {"`abcdefghijklmno",
+        "%60abcdefghijklmno"},
+       {"pqrstuvwxyz{|}~\x7f",
+        "pqrstuvwxyz%7B%7C%7D~%7F"},
+   };
+
+   for (const EncodeCase& entry : encode_cases) {
+     RawCanonOutputT<char> escaped;
+     const char* raw = entry.input;
+     EncodeURIComponent(raw, strlen(raw), &escaped);
+     EXPECT_EQ(entry.output, std::string(escaped.data(), escaped.length()));
+   }
+ }
+
+ TEST_F(URLUtilTest, TestResolveRelativeWithNonStandardBase) {
+ // This tests non-standard (in the sense that IsStandard() == false)
+ // hierarchical schemes.
+ struct ResolveRelativeCase {
+ const char* base;
+ const char* rel;
+ bool is_valid;  // Expected return value of ResolveRelative().
+ const char* out;  // Expected output; only compared when resolution is valid.
+ } resolve_non_standard_cases[] = {
+ // Resolving a relative path against a non-hierarchical URL should fail.
+ {"scheme:opaque_data", "/path", false, ""},
+ // Resolving a relative path against a non-standard authority-based base
+ // URL doesn't alter the authority section.
+ {"scheme://Authority/", "../path", true, "scheme://Authority/path"},
+ // A non-standard hierarchical base is resolved with path URL
+ // canonicalization rules.
+ {"data:/Blah:Blah/", "file.html", true, "data:/Blah:Blah/file.html"},
+ {"data:/Path/../part/part2", "file.html", true,
+ "data:/Path/../part/file.html"},
+ {"data://text/html,payload", "//user:pass@host:33////payload22", true,
+ "data://user:pass@host:33////payload22"},
+ // Path URL canonicalization rules also apply to non-standard authority-
+ // based URLs.
+ {"custom://Authority/", "file.html", true,
+ "custom://Authority/file.html"},
+ {"custom://Authority/", "other://Auth/", true, "other://Auth/"},
+ {"custom://Authority/", "../../file.html", true,
+ "custom://Authority/file.html"},
+ {"custom://Authority/path/", "file.html", true,
+ "custom://Authority/path/file.html"},
+ {"custom://Authority:NoCanon/path/", "file.html", true,
+ "custom://Authority:NoCanon/path/file.html"},
+ // It's still possible to get an invalid path URL.
+ {"custom://Invalid:!#Auth/", "file.html", false, ""},
+ // A path with an authority section gets canonicalized under standard URL
+ // rules, even though the base was non-standard.
+ {"content://content.Provider/", "//other.Provider", true,
+ "content://other.provider/"},
+
+ // Resolving an absolute URL doesn't cause canonicalization of the
+ // result.
+ {"about:blank", "custom://Authority", true, "custom://Authority"},
+ // Fragment URLs can be resolved against a non-standard base.
+ {"scheme://Authority/path", "#fragment", true,
+ "scheme://Authority/path#fragment"},
+ {"scheme://Authority/", "#fragment", true,
+ "scheme://Authority/#fragment"},
+ // Resolving should fail if the base URL is authority-based but is
+ // missing a path component (the '/' at the end).
+ {"scheme://Authority", "path", false, ""},
+ // Test resolving a fragment (only) against any kind of base-URL.
+ {"about:blank", "#id42", true, "about:blank#id42"},
+ {"about:blank", " #id42", true, "about:blank#id42"},
+ {"about:blank#oldfrag", "#newfrag", true, "about:blank#newfrag"},
+ // A surprising side effect of allowing fragments to resolve against
+ // any URL scheme is we might break javascript: URLs by doing so...
+ {"javascript:alert('foo#bar')", "#badfrag", true,
+ "javascript:alert('foo#badfrag"},
+ // In this case, the backslashes will not be canonicalized because it's a
+ // non-standard URL, but they will be treated as path separators,
+ // giving the base URL here a path of "\".
+ //
+ // The result here is somewhat arbitrary. One could argue it should be
+ // either "aaa://a\" or "aaa://a/" since the path is being replaced with
+ // the "current directory". But in the context of resolving on data URLs,
+ // adding the requested dot doesn't seem wrong either.
+ {"aaa://a\\", "aaa:.", true, "aaa://a\\."}};
+
+ for (size_t i = 0; i < gurl_base::size(resolve_non_standard_cases); i++) {
+ const ResolveRelativeCase& test_data = resolve_non_standard_cases[i];
+ Parsed base_parsed;
+ ParsePathURL(test_data.base, strlen(test_data.base), false, &base_parsed);
+
+ std::string resolved;
+ StdStringCanonOutput output(&resolved);
+ Parsed resolved_parsed;
+ bool valid = ResolveRelative(test_data.base, strlen(test_data.base),
+ base_parsed, test_data.rel,
+ strlen(test_data.rel), NULL, &output,
+ &resolved_parsed);
+ output.Complete();
+
+ EXPECT_EQ(test_data.is_valid, valid) << i;
+ if (test_data.is_valid && valid)
+ EXPECT_EQ(test_data.out, resolved) << i;
+ }
+ }
+
+ TEST_F(URLUtilTest, TestNoRefComponent) {
+   // For a mailto: base, a '#' in the relative part must not be split off as a
+   // ref component when the two are resolved together.
+   const char* const kBase = "mailto://to/";
+   const char* const kRelative = "any#body";
+
+   Parsed parsed_base;
+   ParsePathURL(kBase, strlen(kBase), false, &parsed_base);
+
+   std::string spec;
+   StdStringCanonOutput sink(&spec);
+   Parsed parsed_result;
+   const bool resolved_ok =
+       ResolveRelative(kBase, strlen(kBase), parsed_base, kRelative,
+                       strlen(kRelative), NULL, &sink, &parsed_result);
+
+   // Resolution succeeds, and no ref range is reported.
+   EXPECT_TRUE(resolved_ok);
+   EXPECT_FALSE(parsed_result.ref.is_valid());
+ }
+
+ TEST_F(URLUtilTest, PotentiallyDanglingMarkup) {
+   struct ResolveRelativeCase {
+     const char* base;
+     const char* rel;
+     bool potentially_dangling_markup;  // Expected flag on the resolved Parsed.
+     const char* out;  // Expected resolved spec.
+   } cases[] = {
+       {"https://example.com/", "/path<", false, "https://example.com/path%3C"},
+       {"https://example.com/", "\n/path<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "\r/path<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "\t/path<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/pa\nth<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/pa\rth<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/pa\tth<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/path\n<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "/path\t<", true, "https://example.com/path%3C"},
+       {"https://example.com/", "\n/<path", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "\r/<path", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "\t/<path", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<pa\nth", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<pa\rth", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<pa\tth", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<path\n", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<path\r", true, "https://example.com/%3Cpath"},
+       {"https://example.com/", "/<path\t", true, "https://example.com/%3Cpath"},
+   };
+   // Rows expecting true contain a '<' together with \n, \r or \t in |rel|.
+   for (const auto& test : cases) {
+     SCOPED_TRACE(::testing::Message() << test.base << ", " << test.rel);
+     Parsed base_parsed;
+     ParseStandardURL(test.base, strlen(test.base), &base_parsed);
+
+     std::string resolved;
+     StdStringCanonOutput output(&resolved);
+     Parsed resolved_parsed;
+     bool valid =
+         ResolveRelative(test.base, strlen(test.base), base_parsed, test.rel,
+                         strlen(test.rel), NULL, &output, &resolved_parsed);
+     ASSERT_TRUE(valid);
+     output.Complete();
+
+     EXPECT_EQ(test.potentially_dangling_markup,
+               resolved_parsed.potentially_dangling_markup);
+     EXPECT_EQ(test.out, resolved);
+   }
+ }
+
+ TEST_F(URLUtilTest, TestDomainIs) {
+   // Table-driven check of DomainIs(): |host| is the canonicalized host and
+   // |domain| is the domain it is tested against.
+   const struct {
+     const char* host;
+     const char* domain;
+     bool expected;
+   } kTestCases[] = {
+       {"google.com", "google.com", true},
+       {"www.google.com", "google.com", true}, // Subdomain is ignored.
+       {"www.google.com.cn", "google.com", false}, // Different TLD.
+       {"www.google.comm", "google.com", false},
+       {"www.iamnotgoogle.com", "google.com", false}, // Different hostname.
+       {"www.google.com", "Google.com", false}, // The input is not lower-cased.
+
+       // If the host ends with a dot, it matches domains with or without a dot.
+       {"www.google.com.", "google.com", true},
+       {"www.google.com.", "google.com.", true},
+       {"www.google.com.", ".com", true},
+       {"www.google.com.", ".com.", true},
+
+       // But, if the host doesn't end with a dot and the input domain does, then
+       // it's considered to not match.
+       {"www.google.com", "google.com.", false},
+
+       // If the host ends with two dots, it doesn't match.
+       {"www.google.com..", "google.com", false},
+
+       // Empty parameters.
+       {"www.google.com", "", false},
+       {"", "www.google.com", false},
+       {"", "", false},
+   };
+
+   for (const auto& entry : kTestCases) {
+     SCOPED_TRACE(testing::Message() << "(host, domain): (" << entry.host << ", "
+                                     << entry.domain << ")");
+     // The const char* arguments convert implicitly to StringPiece.
+     const bool matches = DomainIs(entry.host, entry.domain);
+     EXPECT_EQ(entry.expected, matches);
+   }
+ }
+
+} // namespace url